From f4098c3371d6e007d30443bc7c72df328de8c0bd Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 11 Nov 2019 23:51:44 +0100
Subject: [PATCH 001/185] TST: filter warnings for is_extension_type
 deprecation (#29549)

---
 pandas/core/dtypes/common.py       |  1 +
 pandas/tests/dtypes/test_common.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 5180f513dfed0..4f9481eccb836 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1508,6 +1508,7 @@ def is_extension_type(arr):
     Check whether an array-like is of a pandas extension class instance.
 
     .. deprecated:: 1.0.0
+        Use ``is_extension_array_dtype`` instead.
 
     Extension classes include categoricals, pandas sparse objects (i.e.
     classes represented within the pandas library and not ones external
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 7abaa0651449e..d8420673104d5 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -549,6 +549,7 @@ def test_is_bool_dtype():
     assert com.is_bool_dtype(pd.Index([True, False]))
 
 
+@pytest.mark.filterwarnings("ignore:'is_extension_type' is deprecated:FutureWarning")
 @pytest.mark.parametrize(
     "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)]
 )
@@ -573,6 +574,35 @@ def test_is_extension_type(check_scipy):
         assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3]))
 
 
+def test_is_extension_type_deprecation():
+    with tm.assert_produces_warning(FutureWarning):
+        com.is_extension_type([1, 2, 3])
+
+
+@pytest.mark.parametrize(
+    "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)]
+)
+def test_is_extension_array_dtype(check_scipy):
+    assert not com.is_extension_array_dtype([1, 2, 3])
+    assert not com.is_extension_array_dtype(np.array([1, 2, 3]))
+    assert not com.is_extension_array_dtype(pd.DatetimeIndex([1, 2, 3]))
+
+    cat = pd.Categorical([1, 2, 3])
+    assert com.is_extension_array_dtype(cat)
+    assert com.is_extension_array_dtype(pd.Series(cat))
+    assert com.is_extension_array_dtype(pd.SparseArray([1, 2, 3]))
+    assert com.is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern"))
+
+    dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+    s = pd.Series([], dtype=dtype)
+    assert com.is_extension_array_dtype(s)
+
+    if check_scipy:
+        import scipy.sparse
+
+        assert not com.is_extension_array_dtype(scipy.sparse.bsr_matrix([1, 2, 3]))
+
+
 def test_is_complex_dtype():
     assert not com.is_complex_dtype(int)
     assert not com.is_complex_dtype(str)
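What the new filter is silencing, as a standalone sketch (illustrative usage, not part of the patch; assumes a pandas build in which `is_extension_type` already emits the deprecation warning):

import warnings
import pandas as pd
from pandas.api.types import is_extension_array_dtype, is_extension_type

cat = pd.Categorical(["a", "b"])
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    is_extension_type(cat)            # deprecated spelling, emits FutureWarning
assert issubclass(w[-1].category, FutureWarning)
assert is_extension_array_dtype(cat)  # the replacement, no warning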
From 5c36aa1305b07b1d6da4c2a63b6d5b9887503699 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 11 Nov 2019 15:08:29 -0800
Subject: [PATCH 002/185] TYPES: __len__, is_all_dates, inferred_type (#29475)

---
 pandas/_libs/hashtable_class_helper.pxi.in | 10 +++++-----
 pandas/_libs/internals.pyx                 |  2 +-
 pandas/core/arrays/categorical.py          |  2 +-
 pandas/core/arrays/datetimelike.py         |  2 +-
 pandas/core/arrays/integer.py              |  2 +-
 pandas/core/arrays/interval.py             |  2 +-
 pandas/core/computation/expr.py            |  2 +-
 pandas/core/frame.py                       |  2 +-
 pandas/core/generic.py                     |  2 +-
 pandas/core/groupby/groupby.py             |  2 +-
 pandas/core/indexes/base.py                |  6 ++++--
 pandas/core/indexes/category.py            |  2 +-
 pandas/core/indexes/datetimes.py           |  4 ++--
 pandas/core/indexes/interval.py            |  6 +++---
 pandas/core/indexes/multi.py               |  6 +++---
 pandas/core/indexes/numeric.py             |  8 ++++----
 pandas/core/indexes/period.py              |  4 ++--
 pandas/core/indexes/range.py               |  2 +-
 pandas/core/indexes/timedeltas.py          |  4 ++--
 pandas/core/internals/blocks.py            |  2 +-
 pandas/core/internals/managers.py          |  2 +-
 pandas/core/series.py                      |  2 +-
 pandas/io/pytables.py                      |  2 +-
 pandas/tests/reshape/test_concat.py        |  2 +-
 scripts/validate_docstrings.py             |  2 +-
 25 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index c39d6d60d4ea5..b207fcb66948d 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -100,7 +100,7 @@ cdef class {{name}}Vector:
             PyMem_Free(self.data)
             self.data = NULL
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.data.n
 
     cpdef to_array(self):
@@ -168,7 +168,7 @@ cdef class StringVector:
            PyMem_Free(self.data)
            self.data = NULL
 
-    def __len__(self):
+    def __len__(self) -> int:
        return self.data.n
 
    def to_array(self):
@@ -212,7 +212,7 @@ cdef class ObjectVector:
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
         self.data = self.ao.data
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.n
 
     cdef inline append(self, object obj):
@@ -270,7 +270,7 @@ cdef class {{name}}HashTable(HashTable):
             size_hint = min(size_hint, _SIZE_HINT_LIMIT)
             kh_resize_{{dtype}}(self.table, size_hint)
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.table.size
 
     def __dealloc__(self):
@@ -897,7 +897,7 @@ cdef class PyObjectHashTable(HashTable):
             kh_destroy_pymap(self.table)
             self.table = NULL
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.table.size
 
     def __contains__(self, object key):
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index ecd090de500da..08decb44a8a53 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -66,7 +66,7 @@ cdef class BlockPlacement:
     def __repr__(self) -> str:
         return str(self)
 
-    def __len__(self):
+    def __len__(self) -> int:
         cdef:
             slice s = self._ensure_has_slice()
         if s is not None:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 39470c7420086..73d1db9bda8ed 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1940,7 +1940,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
 
     take = take_nd
 
-    def __len__(self):
+    def __len__(self) -> int:
         """
         The length of this Categorical.
         """
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 4b83dd0cfff09..f93db4695d38f 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -396,7 +396,7 @@ def size(self) -> int:
         """The number of elements in this array."""
         return np.prod(self.shape)
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self._data)
 
     def __getitem__(self, key):
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 08b53e54b91ef..41d8bffd8c131 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -469,7 +469,7 @@ def __setitem__(self, key, value):
         self._data[key] = value
         self._mask[key] = mask
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self._data)
 
     @property
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index cc41797e7872b..cb482665b3534 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -489,7 +489,7 @@ def _validate(self):
     def __iter__(self):
         return iter(np.asarray(self))
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.left)
 
     def __getitem__(self, value):
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
index 39653c3d695b2..929c9e69d56ac 100644
--- a/pandas/core/computation/expr.py
+++ b/pandas/core/computation/expr.py
@@ -837,7 +837,7 @@ def __call__(self):
     def __repr__(self) -> str:
         return printing.pprint_thing(self.terms)
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.expr)
 
     def parse(self):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7e3c2200dbabc..ebee8b10896be 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1023,7 +1023,7 @@ def itertuples(self, index=True, name="Pandas"):
         # fallback to regular tuples
         return zip(*arrays)
 
-    def __len__(self):
+    def __len__(self) -> int:
         """
         Returns length of info axis, but here we use the index.
         """
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 49df374670577..2468c43337d0d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1952,7 +1952,7 @@ def items(self):
     def iteritems(self):
         return self.items()
 
-    def __len__(self):
+    def __len__(self) -> int:
         """Returns length of info axis"""
         return len(self._info_axis)
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index e73be29d5b104..fd45d60b02277 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -399,7 +399,7 @@ def __init__(
         # we accept no other args
         validate_kwargs("group", kwargs, {})
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.groups)
 
     def __repr__(self) -> str:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index c9697c530628a..ee124ba3851b1 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -649,10 +649,12 @@ def _engine(self):
     # Array-Like Methods
 
     # ndarray compat
-    def __len__(self):
+    def __len__(self) -> int:
         """
         Return the length of the Index.
         """
+        # Assertion needed for mypy, see GH#29475
+        assert self._data is not None
         return len(self._data)
 
     def __array__(self, dtype=None):
@@ -1807,7 +1809,7 @@ def inferred_type(self):
         return lib.infer_dtype(self, skipna=False)
 
     @cache_readonly
-    def is_all_dates(self):
+    def is_all_dates(self) -> bool:
         return is_datetime_array(ensure_object(self.values))
 
     # --------------------------------------------------------------------
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index e5a8edb56e413..0187b47ab50a1 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -366,7 +366,7 @@ def _format_attrs(self):
     # --------------------------------------------------------------------
 
     @property
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         return "categorical"
 
     @property
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 2d0ecf1b936da..4a3ee57084a8a 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -1235,13 +1235,13 @@ def is_type_compatible(self, typ):
         return typ == self.inferred_type or typ == "datetime"
 
     @property
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         # b/c datetime is represented as microseconds since the epoch, make
         # sure we can't have ambiguous indexing
         return "datetime64"
 
     @property
-    def is_all_dates(self):
+    def is_all_dates(self) -> bool:
         return True
 
     def insert(self, loc, item):
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index bc3c0be08ec12..cf5295460d8fc 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -468,7 +468,7 @@ def itemsize(self):
             warnings.simplefilter("ignore")
             return self.left.itemsize + self.right.itemsize
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.left)
 
     @cache_readonly
@@ -524,7 +524,7 @@ def dtype(self):
         return self._data.dtype
 
     @property
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         """Return a string of the type inferred from the values"""
         return "interval"
 
@@ -1357,7 +1357,7 @@ def func(self, other, sort=sort):
         return func
 
     @property
-    def is_all_dates(self):
+    def is_all_dates(self) -> bool:
         """
         This is False even when left/right contain datetime-like
        objects, as the check is done on the Interval itself
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 19769d5b029a1..a6a6de6c13c04 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1217,7 +1217,7 @@ def format(
 
     # --------------------------------------------------------------------
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.codes[0])
 
     def _get_names(self):
@@ -1322,7 +1322,7 @@ def _constructor(self):
         return MultiIndex.from_tuples
 
     @cache_readonly
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         return "mixed"
 
     def _get_level_number(self, level):
@@ -1791,7 +1791,7 @@ def to_flat_index(self):
         return Index(self.values, tupleize_cols=False)
 
     @property
-    def is_all_dates(self):
+    def is_all_dates(self) -> bool:
         return False
 
     def is_lexsorted(self):
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 12a9201b06283..3e2b41f62f30b 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -134,7 +134,7 @@ def _concat_same_dtype(self, indexes, name):
         return result.rename(name)
 
     @property
-    def is_all_dates(self):
+    def is_all_dates(self) -> bool:
         """
         Checks that all the labels are datetime objects.
         """
@@ -228,7 +228,7 @@ class Int64Index(IntegerIndex):
     _default_dtype = np.int64
 
     @property
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         """Always 'integer' for ``Int64Index``"""
         return "integer"
 
@@ -283,7 +283,7 @@ class UInt64Index(IntegerIndex):
     _default_dtype = np.uint64
 
     @property
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         """Always 'integer' for ``UInt64Index``"""
         return "integer"
 
@@ -356,7 +356,7 @@ class Float64Index(NumericIndex):
     _default_dtype = np.float64
 
     @property
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         """Always 'floating' for ``Float64Index``"""
         return "floating"
 
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index ca7be9ba512da..3bcb9ba345713 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -574,7 +574,7 @@ def searchsorted(self, value, side="left", sorter=None):
         return self._ndarray_values.searchsorted(value, side=side, sorter=sorter)
 
     @property
-    def is_all_dates(self):
+    def is_all_dates(self) -> bool:
         return True
 
     @property
@@ -591,7 +591,7 @@ def is_full(self):
         return ((values[1:] - values[:-1]) < 2).all()
 
     @property
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         # b/c data is represented as ints make sure we can't have ambiguous
         # indexing
         return "period"
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 5fa3431fc97c0..67791417f1bb5 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -698,7 +698,7 @@ def _concat_same_dtype(self, indexes, name):
         # In this case return an empty range index.
         return RangeIndex(0, 0).rename(name)
 
-    def __len__(self):
+    def __len__(self) -> int:
         """
         return the length of the RangeIndex
         """
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index 2324b8cf74c46..8114b4a772f28 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -602,11 +602,11 @@ def is_type_compatible(self, typ):
         return typ == self.inferred_type or typ == "timedelta"
 
     @property
-    def inferred_type(self):
+    def inferred_type(self) -> str:
         return "timedelta64"
 
     @property
-    def is_all_dates(self):
+    def is_all_dates(self) -> bool:
         return True
 
     def insert(self, loc, item):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 9402a3ef9a763..5508cf3ca522e 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -288,7 +288,7 @@ def __repr__(self) -> str:
 
         return result
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.values)
 
     def __getstate__(self):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 0e97e55acddad..fbe1db1c23cdb 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -319,7 +319,7 @@ def _post_setstate(self):
         self._known_consolidated = False
         self._rebuild_blknos_and_blklocs()
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.items)
 
     def __repr__(self) -> str:
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 15f405e244d0f..7327c2d543836 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -712,7 +712,7 @@ def put(self, *args, **kwargs):
         )
         self._values.put(*args, **kwargs)
 
-    def __len__(self):
+    def __len__(self) -> int:
         """
         Return the length of the Series.
         """
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index e98802888e582..ee08e2abb2289 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -540,7 +540,7 @@ def __contains__(self, key):
                 return True
         return False
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.groups())
 
     def __repr__(self) -> str:
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 5c930e01c735d..b537200dd7664 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -1879,7 +1879,7 @@ def test_concat_iterables(self):
         tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected)
 
         class CustomIterator1:
-            def __len__(self):
+            def __len__(self) -> int:
                 return 2
 
             def __getitem__(self, index):
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index 1d0f4b583bd0c..7c6f2fea97933 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -250,7 +250,7 @@ def __init__(self, name):
         self.clean_doc = pydoc.getdoc(obj)
         self.doc = NumpyDocString(self.clean_doc)
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.raw_doc)
 
     @staticmethod
""" diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e98802888e582..ee08e2abb2289 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -540,7 +540,7 @@ def __contains__(self, key): return True return False - def __len__(self): + def __len__(self) -> int: return len(self.groups()) def __repr__(self) -> str: diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 5c930e01c735d..b537200dd7664 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1879,7 +1879,7 @@ def test_concat_iterables(self): tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) class CustomIterator1: - def __len__(self): + def __len__(self) -> int: return 2 def __getitem__(self, index): diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 1d0f4b583bd0c..7c6f2fea97933 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -250,7 +250,7 @@ def __init__(self, name): self.clean_doc = pydoc.getdoc(obj) self.doc = NumpyDocString(self.clean_doc) - def __len__(self): + def __len__(self) -> int: return len(self.raw_doc) @staticmethod From 8e4424fbcfc98d23085ddbcf8765946bc85da035 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 Nov 2019 06:02:40 -0800 Subject: [PATCH 003/185] TST: parametrize/de-duplicate test_datetime64 (#29559) --- pandas/tests/arithmetic/test_datetime64.py | 190 +++++++-------------- 1 file changed, 57 insertions(+), 133 deletions(-) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index d239687a37757..4d3d6e2df35db 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -26,7 +26,9 @@ Timestamp, date_range, ) +import pandas.core.arrays.datetimelike as dtl from pandas.core.indexes.datetimes import _to_M8 +from pandas.core.ops import roperator import pandas.util.testing as tm @@ -102,19 +104,24 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - def test_dt64arr_cmp_date_invalid(self, tz_naive_fixture, box_with_array): - # GH#19800, GH#19301 datetime.date comparison raises to - # match DatetimeIndex/Timestamp. This also matches the behavior - # of stdlib datetime.datetime - tz = tz_naive_fixture - - dti = pd.date_range("20010101", periods=10, tz=tz) - date = dti[0].to_pydatetime().date() - - dtarr = tm.box_expected(dti, box_with_array) - assert_invalid_comparison(dtarr, date, box_with_array) - - @pytest.mark.parametrize("other", ["foo", -1, 99, 4.0, object(), timedelta(days=2)]) + @pytest.mark.parametrize( + "other", + [ + "foo", + -1, + 99, + 4.0, + object(), + timedelta(days=2), + # GH#19800, GH#19301 datetime.date comparison raises to + # match DatetimeIndex/Timestamp. 
This also matches the behavior + # of stdlib datetime.datetime + datetime(2001, 1, 1).date(), + # GH#19301 None and NaN are *not* cast to NaT for comparisons + None, + np.nan, + ], + ) def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): # GH#22074, GH#15966 tz = tz_naive_fixture @@ -123,16 +130,6 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_arra dtarr = tm.box_expected(rng, box_with_array) assert_invalid_comparison(dtarr, other, box_with_array) - @pytest.mark.parametrize("other", [None, np.nan]) - def test_dt64arr_cmp_na_scalar_invalid( - self, other, tz_naive_fixture, box_with_array - ): - # GH#19301 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - dtarr = tm.box_expected(dti, box_with_array) - assert_invalid_comparison(dtarr, other, box_with_array) - def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly tz = tz_naive_fixture @@ -258,15 +255,10 @@ def test_nat_comparisons_scalar(self, dtype, data, box_with_array): tm.assert_equal(left >= NaT, expected) tm.assert_equal(NaT <= left, expected) - def test_series_comparison_scalars(self): + @pytest.mark.parametrize("val", [datetime(2000, 1, 4), datetime(2000, 1, 5)]) + def test_series_comparison_scalars(self, val): series = Series(date_range("1/1/2000", periods=10)) - val = datetime(2000, 1, 4) - result = series > val - expected = Series([x > val for x in series]) - tm.assert_series_equal(result, expected) - - val = series[5] result = series > val expected = Series([x > val for x in series]) tm.assert_series_equal(result, expected) @@ -1020,9 +1012,18 @@ def test_dt64arr_add_timestamp_raises(self, box_with_array): # ------------------------------------------------------------- # Other Invalid Addition/Subtraction - @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) - def test_dt64arr_add_sub_float(self, other, box_with_array): - dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") + @pytest.mark.parametrize( + "other", + [ + 3.14, + np.array([2.0, 3.0]), + # GH#13078 datetime +/- Period is invalid + pd.Period("2011-01-01", freq="D"), + ], + ) + @pytest.mark.parametrize("dti_freq", [None, "D"]) + def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array): + dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) dtarr = tm.box_expected(dti, box_with_array) msg = "|".join( [ @@ -1068,24 +1069,6 @@ def test_dt64arr_add_sub_parr( with pytest.raises(TypeError, match=msg): parr - dtarr - @pytest.mark.parametrize("dti_freq", [None, "D"]) - def test_dt64arr_add_sub_period_scalar(self, dti_freq, box_with_array): - # GH#13078 - # not supported, check TypeError - per = pd.Period("2011-01-01", freq="D") - - idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) - dtarr = tm.box_expected(idx, box_with_array) - msg = "|".join(["unsupported operand type", "cannot (add|subtract)"]) - with pytest.raises(TypeError, match=msg): - dtarr + per - with pytest.raises(TypeError, match=msg): - per + dtarr - with pytest.raises(TypeError, match=msg): - dtarr - per - with pytest.raises(TypeError, match=msg): - per - dtarr - class TestDatetime64DateOffsetArithmetic: @@ -1406,7 +1389,7 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): s = tm.box_expected(s, box_with_array) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with 
tm.assert_produces_warning(warn, clear=[dtl]): other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other @@ -1435,7 +1418,7 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[dtl]): res = dtarr + other expected = DatetimeIndex( [dti[n] + other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -1443,11 +1426,11 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[dtl]): res2 = other + dtarr tm.assert_equal(res2, expected) - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[dtl]): res = dtarr - other expected = DatetimeIndex( [dti[n] - other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -2168,16 +2151,16 @@ def test_dti_isub_tdi(self, tz_naive_fixture): ids=lambda x: type(x).__name__, ) @pytest.mark.parametrize("tz", [None, "US/Eastern"]) - def test_add_datetimelike_and_dti(self, addend, tz): + def test_add_datetimelike_and_dtarr(self, box_with_array, addend, tz): # GH#9631 dti = DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize(tz) - msg = ( - "cannot add DatetimeArray and {0}".format(type(addend).__name__) - ).replace("DatetimeIndex", "DatetimeArray") + dtarr = tm.box_expected(dti, box_with_array) + msg = "cannot add DatetimeArray and" + with pytest.raises(TypeError, match=msg): - dti + addend + dtarr + addend with pytest.raises(TypeError, match=msg): - addend + dti + addend + dtarr # ------------------------------------------------------------- @@ -2257,13 +2240,6 @@ def test_timedelta64_equal_timedelta_supported_ops(self, op): intervals = ["D", "h", "m", "s", "us"] - # TODO: unused - # npy16_mappings = {'D': 24 * 60 * 60 * 1000000, - # 'h': 60 * 60 * 1000000, - # 'm': 60 * 1000000, - # 's': 1000000, - # 'us': 1} - def timedelta64(*args): # see casting notes in NumPy gh-12927 return np.sum(list(starmap(np.timedelta64, zip(args, intervals)))) @@ -2406,82 +2382,30 @@ def test_dti_add_series(self, tz, names): result4 = index + ser.values tm.assert_index_equal(result4, expected) + @pytest.mark.parametrize("other_box", [pd.Index, Series]) + @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub]) @pytest.mark.parametrize( "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] ) - def test_dti_add_offset_index(self, tz_naive_fixture, names): + def test_dti_addsub_offset_arraylike(self, tz_naive_fixture, names, op, other_box): # GH#18849, GH#19744 - tz = tz_naive_fixture - dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res = dti + other - expected = DatetimeIndex( - [dti[n] + other[n] for n in range(len(dti))], name=names[2], freq="infer" - ) - tm.assert_index_equal(res, expected) - - with tm.assert_produces_warning( - PerformanceWarning, 
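Much of the de-duplication above folds separate tests into `@pytest.mark.parametrize` cases, with `pytest.param` carrying per-case marks. A generic standalone sketch of that mechanism (hypothetical test; a plain skip mark stands in for `td.skip_if_no_scipy`):

import pytest

@pytest.mark.parametrize(
    "check_scipy",
    [False, pytest.param(True, marks=pytest.mark.skip(reason="demo: dependency absent"))],
)
def test_demo(check_scipy):
    # Each value runs as its own test node; the mark attaches to one case only.
    assert check_scipy in (False, True)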
From ab9dca0ff230b18432d34b65b5d8e1d75722ce94 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 12 Nov 2019 11:28:22 -0800
Subject: [PATCH 004/185] REF: avoid result=None case in _python_agg_general
 (#29499)

---
 pandas/core/groupby/groupby.py | 10 ++++++----
 pandas/core/groupby/ops.py     | 20 +++++++++++++-------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index fd45d60b02277..e8d64c7a22bb7 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -898,6 +898,10 @@ def _python_agg_general(self, func, *args, **kwargs):
         # iterate through "columns" ex exclusions to populate output dict
         output = {}
         for name, obj in self._iterate_slices():
+            if self.grouper.ngroups == 0:
+                # agg_series below assumes ngroups > 0
+                continue
+
             try:
                 # if this function is invalid for this dtype, we will ignore it.
                 func(obj[:0])
@@ -911,10 +915,8 @@ def _python_agg_general(self, func, *args, **kwargs):
                 pass
 
             result, counts = self.grouper.agg_series(obj, f)
-            if result is not None:
-                # TODO: only 3 test cases get None here, do something
-                # in those cases
-                output[name] = self._try_cast(result, obj, numeric_only=True)
+            assert result is not None
+            output[name] = self._try_cast(result, obj, numeric_only=True)
 
         if len(output) == 0:
             return self._python_apply_general(f)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index fec472f503c9f..6796239cf3fd9 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -601,6 +601,9 @@ def _transform(
         return result
 
     def agg_series(self, obj: Series, func):
+        # Caller is responsible for checking ngroups != 0
+        assert self.ngroups != 0
+
         if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
             # _aggregate_series_fast would raise TypeError when
             # calling libreduction.Slider
@@ -626,8 +629,10 @@ def agg_series(self, obj: Series, func):
             return self._aggregate_series_pure_python(obj, func)
 
     def _aggregate_series_fast(self, obj, func):
-        # At this point we have already checked that obj.index is not a MultiIndex
-        # and that obj is backed by an ndarray, not ExtensionArray
+        # At this point we have already checked that
+        #  - obj.index is not a MultiIndex
+        #  - obj is backed by an ndarray, not ExtensionArray
+        #  - ngroups != 0
         func = self._is_builtin_func(func)
 
         group_index, _, ngroups = self.group_info
@@ -660,11 +665,9 @@ def _aggregate_series_pure_python(self, obj, func):
                 counts[label] = group.shape[0]
                 result[label] = res
 
-        if result is not None:
-            # if splitter is empty, result can be None, in which case
-            # maybe_convert_objects would raise TypeError
-            result = lib.maybe_convert_objects(result, try_float=0)
-            # TODO: try_cast back to EA?
+        assert result is not None
+        result = lib.maybe_convert_objects(result, try_float=0)
+        # TODO: try_cast back to EA?
 
         return result, counts
 
@@ -815,6 +818,9 @@ def groupings(self):
         ]
 
     def agg_series(self, obj: Series, func):
+        # Caller is responsible for checking ngroups != 0
+        assert self.ngroups != 0
+
         if is_extension_array_dtype(obj.dtype):
             # pre-empty SeriesBinGrouper from raising TypeError
             # TODO: watch out, this can return None
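The user-visible contract behind the `ngroups == 0` early-continue above: aggregating with zero groups must yield an empty result instead of reaching `agg_series`. A quick sanity check (illustrative; exact result dtype may vary by pandas version):

import pandas as pd

df = pd.DataFrame({"key": [], "val": []})
result = df.groupby("key")["val"].agg(lambda x: x.mean())
assert len(result) == 0  # no groups, no error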
From 37709081c8cc496ca5d90e0cb5df315a7b0d7ec2 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Tue, 12 Nov 2019 11:28:55 -0800
Subject: [PATCH 005/185] Clean Up Case Insensitive Comps in Tokenizer (#29534)

---
 pandas/_libs/src/parse_helper.h     | 10 ------
 pandas/_libs/src/parser/tokenizer.c | 54 +++++++++++------------------
 pandas/_libs/src/parser/tokenizer.h |  1 +
 3 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h
index 0a767dd27b658..7fbe7a04d5b22 100644
--- a/pandas/_libs/src/parse_helper.h
+++ b/pandas/_libs/src/parse_helper.h
@@ -11,8 +11,6 @@ The full license is in the LICENSE file, distributed with this software.
 #define PANDAS__LIBS_SRC_PARSE_HELPER_H_
 
 #include
-#include "inline_helper.h"
-#include "headers/portable.h"
 #include "parser/tokenizer.h"
 
 int to_double(char *item, double *p_value, char sci, char decimal,
@@ -94,12 +92,4 @@ int floatify(PyObject *str, double *result, int *maybe_int) {
     return -1;
 }
 
-PANDAS_INLINE void lowercase(char *p) {
-    for (; *p; ++p) *p = tolower_ascii(*p);
-}
-
-PANDAS_INLINE void uppercase(char *p) {
-    for (; *p; ++p) *p = toupper_ascii(*p);
-}
-
 #endif  // PANDAS__LIBS_SRC_PARSE_HELPER_H_
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 2752fb6424022..83869a1d9c342 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1426,42 +1426,30 @@ int tokenize_all_rows(parser_t *self) {
     return status;
 }
 
-PANDAS_INLINE void uppercase(char *p) {
-    for (; *p; ++p) *p = toupper_ascii(*p);
-}
-
+/*
+ * Function: to_boolean
+ * --------------------
+ *
+ * Validate if item should be recognized as a boolean field.
+ *
+ * item: const char* representing parsed text
+ * val : pointer to a uint8_t of boolean representation
+ *
+ * If item is determined to be boolean, this method will set
+ * the appropriate value of val and return 0. A non-zero exit
+ * status means that item was not inferred to be boolean, and
+ * leaves the value of *val unmodified.
+ */
 int to_boolean(const char *item, uint8_t *val) {
-    char *tmp;
-    int i, status = 0;
-    size_t length0 = (strlen(item) + 1);
-    int bufsize = length0;
-
-    static const char *tstrs[1] = {"TRUE"};
-    static const char *fstrs[1] = {"FALSE"};
-
-    tmp = malloc(bufsize);
-    snprintf(tmp, length0, "%s", item);
-    uppercase(tmp);
-
-    for (i = 0; i < 1; ++i) {
-        if (strcmp(tmp, tstrs[i]) == 0) {
-            *val = 1;
-            goto done;
-        }
+    if (strcasecmp(item, "TRUE") == 0) {
+        *val = 1;
+        return 0;
+    } else if (strcasecmp(item, "FALSE") == 0) {
+        *val = 0;
+        return 0;
     }
 
-    for (i = 0; i < 1; ++i) {
-        if (strcmp(tmp, fstrs[i]) == 0) {
-            *val = 0;
-            goto done;
-        }
-    }
-
-    status = -1;
-
-done:
-    free(tmp);
-    return status;
+    return -1;
 }
 
 // ---------------------------------------------------------------------------
diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
index 66ef1887d6bc3..802b58d8ec916 100644
--- a/pandas/_libs/src/parser/tokenizer.h
+++ b/pandas/_libs/src/parser/tokenizer.h
@@ -22,6 +22,7 @@ See LICENSE for the license
 
 #include "../headers/stdint.h"
 #include "../inline_helper.h"
+#include "../headers/portable.h"
 
 #include "khash.h"
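The rewritten `to_boolean` is what backs case-insensitive boolean recognition in the C tokenizer. Its user-facing effect, sketched from the Python side (a behavioral illustration, not the patched code itself):

from io import StringIO
import pandas as pd

df = pd.read_csv(StringIO("flag\nTrue\nFALSE\ntrue\n"))
assert df["flag"].dtype == bool  # TRUE/False/true all match, regardless of case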
From 2ade669c7a3f4ccea4c74c000d2401765e42e78a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 12 Nov 2019 11:29:29 -0800
Subject: [PATCH 006/185] TST: cln runtime imports in some tests (#29560)

---
 pandas/tests/scalar/period/test_period.py       | 10 ++++++++++
 pandas/tests/scalar/timestamp/test_timestamp.py |  4 ----
 pandas/tests/scalar/timestamp/test_timezones.py |  5 ++---
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py
index 3bdf91cbf838b..73371c48f9370 100644
--- a/pandas/tests/scalar/period/test_period.py
+++ b/pandas/tests/scalar/period/test_period.py
@@ -1044,6 +1044,7 @@ def test_add_sub_nat(self):
         assert NaT - p is NaT
 
         p = Period("NaT", freq="M")
+        assert p is NaT
         assert p + NaT is NaT
         assert NaT + p is NaT
         assert p - NaT is NaT
@@ -1284,6 +1285,7 @@ def test_add_offset_nat(self):
         # freq is DateOffset
         for freq in ["A", "2A", "3A"]:
             p = Period("NaT", freq=freq)
+            assert p is NaT
             for o in [offsets.YearEnd(2)]:
                 assert p + o is NaT
                 assert o + p is NaT
@@ -1300,6 +1302,7 @@ def test_add_offset_nat(self):
 
         for freq in ["M", "2M", "3M"]:
             p = Period("NaT", freq=freq)
+            assert p is NaT
             for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]:
                 assert p + o is NaT
                 assert o + p is NaT
@@ -1317,6 +1320,7 @@ def test_add_offset_nat(self):
         # freq is Tick
         for freq in ["D", "2D", "3D"]:
             p = Period("NaT", freq=freq)
+            assert p is NaT
             for o in [
                 offsets.Day(5),
                 offsets.Hour(24),
@@ -1340,6 +1344,7 @@ def test_add_offset_nat(self):
 
         for freq in ["H", "2H", "3H"]:
             p = Period("NaT", freq=freq)
+            assert p is NaT
             for o in [
                 offsets.Day(2),
                 offsets.Hour(3),
@@ -1439,6 +1444,7 @@ def test_sub_offset_nat(self):
         # freq is DateOffset
         for freq in ["A", "2A", "3A"]:
             p = Period("NaT", freq=freq)
+            assert p is NaT
             for o in [offsets.YearEnd(2)]:
                 assert p - o is NaT
 
@@ -1453,6 +1459,7 @@ def test_sub_offset_nat(self):
 
         for freq in ["M", "2M", "3M"]:
             p = Period("NaT", freq=freq)
+            assert p is NaT
             for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]:
                 assert p - o is NaT
 
@@ -1468,6 +1475,7 @@ def test_sub_offset_nat(self):
         # freq is Tick
         for freq in ["D", "2D", "3D"]:
             p = Period("NaT", freq=freq)
+            assert p is NaT
             for o in [
                 offsets.Day(5),
                 offsets.Hour(24),
@@ -1489,6 +1497,7 @@ def test_sub_offset_nat(self):
 
         for freq in ["H", "2H", "3H"]:
             p = Period("NaT", freq=freq)
+            assert p is NaT
             for o in [
                 offsets.Day(2),
                 offsets.Hour(3),
@@ -1511,6 +1520,7 @@ def test_sub_offset_nat(self):
     @pytest.mark.parametrize("freq", ["M", "2M", "3M"])
     def test_nat_ops(self, freq):
         p = Period("NaT", freq=freq)
+        assert p is NaT
         assert p + 1 is NaT
         assert 1 + p is NaT
         assert p - 1 is NaT
diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py
index 652dd34ca7ce2..f9fa80644d4b9 100644
--- a/pandas/tests/scalar/timestamp/test_timestamp.py
+++ b/pandas/tests/scalar/timestamp/test_timestamp.py
@@ -202,8 +202,6 @@ def test_constructor(self):
         base_expected = 1404205200000000000
 
         # confirm base representation is correct
-        import calendar
-
         assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected
 
         tests = [
@@ -275,8 +273,6 @@ def test_constructor_with_stringoffset(self):
         base_expected = 1404205200000000000
 
         # confirm base representation is correct
-        import calendar
-
         assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected
 
         tests = [
diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py
index 424b0c9abdef8..250f48b7e711b 100644
--- a/pandas/tests/scalar/timestamp/test_timezones.py
+++ b/pandas/tests/scalar/timestamp/test_timezones.py
@@ -306,15 +306,14 @@ def test_astimezone(self, tzstr):
 
     @td.skip_if_windows
     def test_tz_convert_utc_with_system_utc(self):
-        from pandas._libs.tslibs.timezones import maybe_get_tz
-
         # from system utc to real utc
-        ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC"))
+        ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC"))
         # check that the time hasn't changed.
         assert ts == ts.tz_convert(dateutil.tz.tzutc())
 
         # from system utc to real utc
-        ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC"))
+        ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC"))
         # check that the time hasn't changed.
         assert ts == ts.tz_convert(dateutil.tz.tzutc())
From f631d0c6430beaa0d1c4e761586e7a2104dd2cfe Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 12 Nov 2019 15:25:41 -0600
Subject: [PATCH 007/185] DOC: remove okwarnings for io (#29577)

Closes https://github.com/pandas-dev/pandas/issues/26843
---
 doc/source/user_guide/io.rst | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index f8e174abfd193..6e45d6748c2a5 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4671,7 +4671,6 @@ See the `Full Documentation `__.
 Write to a feather file.
 
 .. ipython:: python
-   :okwarning:
 
    df.to_feather('example.feather')
 
@@ -4748,7 +4747,6 @@ See the documentation for `pyarrow `__ an
 Write to a parquet file.
 
 .. ipython:: python
-   :okwarning:
 
    df.to_parquet('example_pa.parquet', engine='pyarrow')
    df.to_parquet('example_fp.parquet', engine='fastparquet')
@@ -4765,7 +4763,6 @@ Read from a parquet file.
 Read only certain columns of a parquet file.
 
 .. ipython:: python
-   :okwarning:
 
    result = pd.read_parquet('example_fp.parquet',
                             engine='fastparquet', columns=['a', 'b'])
@@ -4788,7 +4785,6 @@ Serializing a ``DataFrame`` to parquet may include the implicit index as one or
 more columns in the output file. Thus, this code:
 
 .. ipython:: python
-   :okwarning:
 
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
    df.to_parquet('test.parquet', engine='pyarrow')
@@ -4805,7 +4801,6 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to
 :func:`~pandas.DataFrame.to_parquet`:
 
 .. ipython:: python
-   :okwarning:
 
    df.to_parquet('test.parquet', index=False)
From 54f984815960b094a5433b927df87dd2d1063155 Mon Sep 17 00:00:00 2001
From: Louis Huynh <12685195+louishuynh@users.noreply.github.com>
Date: Tue, 12 Nov 2019 23:07:54 +0000
Subject: [PATCH 008/185] TST: Add test to check category dtype remains
 unchanged after concat. (#29352)

---
 pandas/tests/reshape/test_concat.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index b537200dd7664..46dafbc4e1ec8 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -2747,6 +2747,22 @@ def test_concat_categorical_tz():
     tm.assert_series_equal(result, expected)
 
 
+def test_concat_categorical_unchanged():
+    # GH-12007
+    # test fix for when concat on categorical and float
+    # coerces dtype categorical -> float
+    df = pd.DataFrame(pd.Series(["a", "b", "c"], dtype="category", name="A"))
+    ser = pd.Series([0, 1, 2], index=[0, 1, 3], name="B")
+    result = pd.concat([df, ser], axis=1)
+    expected = pd.DataFrame(
+        {
+            "A": pd.Series(["a", "b", "c", np.nan], dtype="category"),
+            "B": pd.Series([0, 1, np.nan, 2], dtype="float"),
+        }
+    )
+    tm.assert_equal(result, expected)
+
+
 def test_concat_datetimeindex_freq():
     # GH 3232
     # Monotonic index result
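The behavior the new test pins down, as a user would hit it (mirrors the test above):

import numpy as np
import pandas as pd

df = pd.DataFrame(pd.Series(["a", "b", "c"], dtype="category", name="A"))
ser = pd.Series([0, 1, 2], index=[0, 1, 3], name="B")
result = pd.concat([df, ser], axis=1)
assert str(result["A"].dtype) == "category"  # no longer coerced to float/object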
From 8e4424fbcfc98d23085ddbcf8765946bc85da035 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 12 Nov 2019 15:08:46 -0800
Subject: [PATCH 009/185] REF: Pre-empt ValueError in _aggregate_series_fast
 (#29500)

---
 pandas/_libs/reduction.pyx               |  9 +++++++--
 pandas/core/groupby/ops.py               | 12 +++++++-----
 pandas/tests/groupby/test_bin_groupby.py | 10 ++++++++++
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index e6e658c0c6979..a19226670ec0a 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -323,6 +323,10 @@ cdef class SeriesGrouper(_BaseGrouper):
         # safer obj._get_values(slice(None, 0))
         assert dummy is not None
 
+        if len(series) == 0:
+            # get_result would never assign `result`
+            raise ValueError("SeriesGrouper requires non-empty `series`")
+
         self.labels = labels
         self.f = f
 
@@ -408,8 +412,9 @@ cdef class SeriesGrouper(_BaseGrouper):
         islider.reset()
         vslider.reset()
 
-        if result is None:
-            raise ValueError("No result.")
+        # We check for empty series in the constructor, so should always
+        # have result initialized by this point.
+        assert result is not None, "`result` has not been assigned."
 
         if result.dtype == np.object_:
             result = maybe_convert_objects(result)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 6796239cf3fd9..ae397277de41c 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -604,7 +604,11 @@ def agg_series(self, obj: Series, func):
         # Caller is responsible for checking ngroups != 0
         assert self.ngroups != 0
 
-        if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
+        if len(obj) == 0:
+            # SeriesGrouper would raise if we were to call _aggregate_series_fast
+            return self._aggregate_series_pure_python(obj, func)
+
+        elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
             # _aggregate_series_fast would raise TypeError when
             # calling libreduction.Slider
             # TODO: can we get a performant workaround for EAs backed by ndarray?
@@ -618,10 +622,7 @@ def agg_series(self, obj: Series, func):
         try:
             return self._aggregate_series_fast(obj, func)
         except ValueError as err:
-            if "No result." in str(err):
-                # raised in libreduction
-                pass
-            elif "Function does not reduce" in str(err):
+            if "Function does not reduce" in str(err):
                 # raised in libreduction
                 pass
             else:
@@ -632,6 +633,7 @@ def _aggregate_series_fast(self, obj, func):
         # At this point we have already checked that
         #  - obj.index is not a MultiIndex
         #  - obj is backed by an ndarray, not ExtensionArray
+        #  - len(obj) > 0
         #  - ngroups != 0
         func = self._is_builtin_func(func)
 
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
index 0e7a66769d2d4..4ede6b165c691 100644
--- a/pandas/tests/groupby/test_bin_groupby.py
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -25,6 +25,16 @@ def test_series_grouper():
     tm.assert_almost_equal(counts, exp_counts)
 
 
+def test_series_grouper_requires_nonempty_raises():
+    # GH#29500
+    obj = Series(np.random.randn(10))
+    dummy = obj[:0]
+    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)
+
+    with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"):
+        libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy)
+
+
 def test_series_bin_grouper():
     obj = Series(np.random.randn(10))
     dummy = obj[:0]
From 1715468e5a374dbc410e91c0e6631ccb05b8ca61 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 12 Nov 2019 15:11:05 -0800
Subject: [PATCH 010/185] CLN: raise early instead of try/except (#29581)

---
 pandas/core/groupby/groupby.py | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index e8d64c7a22bb7..f9f20d13f9585 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -636,24 +636,14 @@ def curried(x):
                     # TODO: is the above comment accurate?
                     raise
 
-                # related to : GH3688
-                # try item-by-item
-                # this can be called recursively, so need to raise
-                # ValueError
-                # if we don't have this method to indicated to aggregate to
-                # mark this column as an error
-                try:
-                    result = self._aggregate_item_by_item(name, *args, **kwargs)
-                    assert self.obj.ndim == 2
-                    return result
-                except AttributeError:
-                    # e.g. SparseArray has no flags attr
-                    # FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item'
-                    #  occurs in idxmax() case
-                    #  in tests.groupby.test_function.test_non_cython_api
-                    assert self.obj.ndim == 1
+                if self.obj.ndim == 1:
+                    # this can be called recursively, so need to raise ValueError
                     raise ValueError
 
+                # GH#3688 try to operate item-by-item
+                result = self._aggregate_item_by_item(name, *args, **kwargs)
+                return result
+
         wrapper.__name__ = name
         return wrapper
 
From fe143e57b7db3f148a55119b77c7b84984402144 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 12 Nov 2019 15:12:08 -0800
Subject: [PATCH 011/185] CLN: pre-empt NotImplementedErorr in
 _aggregate_multiple_funcs (#29582)

---
 pandas/core/groupby/generic.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 56a8a7d15077b..569f57346be4e 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -888,6 +888,11 @@ def aggregate(self, func=None, *args, **kwargs):
             return self._python_agg_general(func, *args, **kwargs)
         elif args or kwargs:
             result = self._aggregate_frame(func, *args, **kwargs)
+
+        elif self.axis == 1:
+            # _aggregate_multiple_funcs does not allow self.axis == 1
+            result = self._aggregate_frame(func)
+
         else:
 
             # try to treat as if we are passing a list
@@ -901,17 +906,11 @@ def aggregate(self, func=None, *args, **kwargs):
                     raise
                 result = self._aggregate_frame(func)
             except NotImplementedError as err:
-                if "axis other than 0 is not supported" in str(err):
-                    # raised directly by _aggregate_multiple_funcs
-                    pass
-                elif "decimal does not support skipna=True" in str(err):
+                if "decimal does not support skipna=True" in str(err):
                     # FIXME: kludge for DecimalArray tests
                     pass
                 else:
                     raise
-                # FIXME: this is raised in a bunch of
-                #  test_whitelist.test_regression_whitelist_methods tests,
-                #  can be avoided
                 result = self._aggregate_frame(func)
             else:
                 result.columns = Index(
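For context on the new `self.axis == 1` branch above, this is the column-wise grouping it routes to `_aggregate_frame` (illustrative usage; the printed result is what the per-group column sums work out to under that assumption):

import numpy as np
import pandas as pd

df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
# Group the *columns* by name and aggregate across each column group.
result = df.groupby(level=0, axis=1).agg(np.sum)
print(result)
#    a  b
# 0  3  3
# 1  9  6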
From 5b580fb72fecb1508f8264b1cc451e63547cc26a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 12 Nov 2019 15:13:44 -0800
Subject: [PATCH 012/185] CLN: annotate ndim, is_monotonic_decreasing,
 has_duplicates, is_, nlevels (#29561)

---
 pandas/_libs/indexing.pyx         | 2 +-
 pandas/core/base.py               | 6 +++---
 pandas/core/computation/ops.py    | 2 +-
 pandas/core/generic.py            | 2 +-
 pandas/core/indexes/base.py       | 8 ++++----
 pandas/core/indexes/category.py   | 2 +-
 pandas/core/indexes/interval.py   | 2 +-
 pandas/core/indexes/multi.py      | 4 ++--
 pandas/core/indexes/range.py      | 4 ++--
 pandas/core/internals/managers.py | 2 +-
 10 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx
index 308e914b7b5b7..7a25a52c7e608 100644
--- a/pandas/_libs/indexing.pyx
+++ b/pandas/_libs/indexing.pyx
@@ -11,7 +11,7 @@ cdef class _NDFrameIndexerBase:
         self._ndim = None
 
     @property
-    def ndim(self):
+    def ndim(self) -> int:
         # Delay `ndim` instantiation until required as reading it
         # from `obj` isn't entirely cheap.
         ndim = self._ndim
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 10e7b5d186bba..e070005c56d7a 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -686,7 +686,7 @@ def transpose(self, *args, **kwargs):
     )
 
     @property
-    def _is_homogeneous_type(self):
+    def _is_homogeneous_type(self) -> bool:
         """
         Whether the object has a single dtype.
 
@@ -711,7 +711,7 @@ def shape(self):
         return self._values.shape
 
     @property
-    def ndim(self):
+    def ndim(self) -> int:
         """
         Number of dimensions of the underlying data, by definition 1.
         """
@@ -1467,7 +1467,7 @@ def is_monotonic(self):
     is_monotonic_increasing = is_monotonic
 
     @property
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         """
         Return boolean if values in the object are
         monotonic_decreasing.
diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py
index fe74b6994be7c..8fab5bd87d4fe 100644
--- a/pandas/core/computation/ops.py
+++ b/pandas/core/computation/ops.py
@@ -167,7 +167,7 @@ def name(self):
         return self._name
 
     @property
-    def ndim(self):
+    def ndim(self) -> int:
         return self._value.ndim
 
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 2468c43337d0d..c8ce7561a12b8 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -569,7 +569,7 @@ def axes(self):
         return [self._get_axis(a) for a in self._AXIS_ORDERS]
 
     @property
-    def ndim(self):
+    def ndim(self) -> int:
         """
         Return an int representing the number of axes / array dimensions.
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index ee124ba3851b1..9748342c86f31 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -607,7 +607,7 @@ def _update_inplace(self, result, **kwargs):
         # guard when called from IndexOpsMixin
         raise TypeError("Index can't be updated inplace")
 
-    def is_(self, other):
+    def is_(self, other) -> bool:
         """
         More flexible, faster check like ``is`` but that works through views.
 
@@ -1452,7 +1452,7 @@ def rename(self, name, inplace=False):
     # Level-Centric Methods
 
     @property
-    def nlevels(self):
+    def nlevels(self) -> int:
         """
         Number of levels.
         """
@@ -1677,7 +1677,7 @@ def is_monotonic_increasing(self):
         return self._engine.is_monotonic_increasing
 
     @property
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         """
         Return if the index is monotonic decreasing (only equal or
         decreasing) values.
@@ -1735,7 +1735,7 @@ def is_unique(self):
         return self._engine.is_unique
 
     @property
-    def has_duplicates(self):
+    def has_duplicates(self) -> bool:
         return not self.is_unique
 
     def is_boolean(self):
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 0187b47ab50a1..49bb705e09469 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -463,7 +463,7 @@ def is_monotonic_increasing(self):
         return self._engine.is_monotonic_increasing
 
     @property
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         return self._engine.is_monotonic_decreasing
 
     @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs)
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index cf5295460d8fc..1c4addfb44839 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -558,7 +558,7 @@ def is_monotonic_increasing(self):
         return self._engine.is_monotonic_increasing
 
     @cache_readonly
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         """
         Return True if the IntervalIndex is monotonic decreasing (only equal or
         decreasing values), else False
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index a6a6de6c13c04..a83fd6bf59f05 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -675,7 +675,7 @@ def array(self):
         raise ValueError(msg)
 
     @property
-    def _is_homogeneous_type(self):
+    def _is_homogeneous_type(self) -> bool:
         """Whether the levels of a MultiIndex all have the same dtype.
 
         This looks at the dtypes of the levels.
@@ -1427,7 +1427,7 @@ def is_monotonic_increasing(self):
         return Index(self.values).is_monotonic
 
     @cache_readonly
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         """
         return if the index is monotonic decreasing (only equal or
         decreasing) values.
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 67791417f1bb5..962ba8cc00557 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -353,11 +353,11 @@ def is_monotonic_increasing(self):
         return self._range.step > 0 or len(self) <= 1
 
     @cache_readonly
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         return self._range.step < 0 or len(self) <= 1
 
     @property
-    def has_duplicates(self):
+    def has_duplicates(self) -> bool:
         return False
 
     def __contains__(self, key: Union[int, np.integer]) -> bool:
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 6408da37d4343..0e97e55acddad 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -166,7 +166,7 @@ def shape(self):
         return tuple(len(ax) for ax in self.axes)
 
     @property
-    def ndim(self):
+    def ndim(self) -> int:
         return len(self.axes)
 
     def set_axis(self, axis, new_labels):

From 82c9547ddcaf2fd70e00f1368731f14a03bbac88 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 12 Nov 2019 15:14:39 -0800
Subject: [PATCH 013/185] BUG: np.datetime64 - TimedeltaArray (#29558)

---
 doc/source/whatsnew/v1.0.0.rst              |  2 +-
 pandas/core/arrays/datetimelike.py          |  3 +++
 pandas/tests/arithmetic/test_timedelta64.py | 15 ++++++++++-----
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index cd012fe755337..5a5584f89d3a1 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -318,7 +318,7 @@ Datetimelike
 
 Timedelta
 ^^^^^^^^^
-
+- Bug in subtracting a :class:`TimedeltaIndex` or :class:`TimedeltaArray` from a ``np.datetime64`` object (:issue:`29558`)
 -
 -
 
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index f93db4695d38f..497f33f0f4704 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -1303,6 +1303,9 @@ def __rsub__(self, other):
         if is_datetime64_any_dtype(other) and is_timedelta64_dtype(self.dtype):
             # ndarray[datetime64] cannot be subtracted from self, so
             # we need to wrap in DatetimeArray/Index and flip the operation
+            if lib.is_scalar(other):
+                # i.e. np.datetime64 object
+                return Timestamp(other) - self
             if not isinstance(other, DatetimeLikeArrayMixin):
                 # Avoid down-casting DatetimeIndex
                 from pandas.core.arrays import DatetimeArray
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
index ecb07fa49036a..d45daf9ab8433 100644
--- a/pandas/tests/arithmetic/test_timedelta64.py
+++ b/pandas/tests/arithmetic/test_timedelta64.py
@@ -896,11 +896,16 @@ def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture):
         result = other + idx
         tm.assert_equal(result, expected)
 
-    def test_td64arr_add_sub_timestamp(self, box_with_array):
-        # GH#11925
-        ts = Timestamp("2012-01-01")
-        # TODO: parametrize over types of datetime scalar?
-
+    @pytest.mark.parametrize(
+        "ts",
+        [
+            Timestamp("2012-01-01"),
+            Timestamp("2012-01-01").to_pydatetime(),
+            Timestamp("2012-01-01").to_datetime64(),
+        ],
+    )
+    def test_td64arr_add_sub_datetimelike_scalar(self, ts, box_with_array):
+        # GH#11925, GH#29558
         tdi = timedelta_range("1 day", periods=3)
         expected = pd.date_range("2012-01-02", periods=3)
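What the fix enables at the user level (matches the whatsnew entry above):

import numpy as np
import pandas as pd

tdi = pd.timedelta_range("1 day", periods=3)
result = np.datetime64("2012-01-02") - tdi  # previously raised
expected = pd.DatetimeIndex(["2012-01-01", "2011-12-31", "2011-12-30"])
assert result.equals(expected)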
col_res is None: - raise ParserError('Unable to parse column {i}'.format(i=i)) + raise ParserError(f'Unable to parse column {i}') results[i] = col_res @@ -1178,12 +1174,9 @@ cdef class TextReader: col_res = col_res.astype(col_dtype) if (col_res != col_res_orig).any(): raise ValueError( - "cannot safely convert passed user dtype of " - "{col_dtype} for {col_res} dtyped data in " - "column {column}".format( - col_dtype=col_dtype, - col_res=col_res_orig.dtype.name, - column=i)) + f"cannot safely convert passed user dtype of " + f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in " + f"column {i}") return col_res, na_count @@ -1216,9 +1209,9 @@ cdef class TextReader: dtype=dtype) except NotImplementedError: raise NotImplementedError( - "Extension Array: {ea} must implement " - "_from_sequence_of_strings in order " - "to be used in parser methods".format(ea=array_type)) + f"Extension Array: {array_type} must implement " + f"_from_sequence_of_strings in order " + f"to be used in parser methods") return result, na_count @@ -1228,8 +1221,7 @@ cdef class TextReader: end, na_filter, na_hashset) if user_dtype and na_count is not None: if na_count > 0: - raise ValueError("Integer column has NA values in " - "column {column}".format(column=i)) + raise ValueError(f"Integer column has NA values in column {i}") except OverflowError: result = _try_uint64(self.parser, i, start, end, na_filter, na_hashset) @@ -1253,8 +1245,7 @@ cdef class TextReader: self.true_set, self.false_set) if user_dtype and na_count is not None: if na_count > 0: - raise ValueError("Bool column has NA values in " - "column {column}".format(column=i)) + raise ValueError(f"Bool column has NA values in column {i}") return result, na_count elif dtype.kind == 'S': @@ -1270,8 +1261,7 @@ cdef class TextReader: elif dtype.kind == 'U': width = dtype.itemsize if width > 0: - raise TypeError("the dtype {dtype} is not " - "supported for parsing".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported for parsing") # unicode variable width return self._string_convert(i, start, end, na_filter, @@ -1280,12 +1270,11 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) elif is_datetime64_dtype(dtype): - raise TypeError("the dtype {dtype} is not supported " - "for parsing, pass this column " - "using parse_dates instead".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported " + f"for parsing, pass this column " + f"using parse_dates instead") else: - raise TypeError("the dtype {dtype} is not " - "supported for parsing".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported for parsing") cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, kh_str_starts_t *na_hashset): @@ -2132,7 +2121,7 @@ cdef raise_parser_error(object base, parser_t *parser): Py_XDECREF(type) raise old_exc - message = '{base}. C error: '.format(base=base) + message = f'{base}. 
C error: ' if parser.error_msg != NULL: message += parser.error_msg.decode('utf-8') else: diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index f848310d961e1..141735a97938a 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -204,12 +204,12 @@ cpdef assert_almost_equal(a, b, # case for zero if abs(fa) < 1e-5: if not decimal_almost_equal(fa, fb, decimal): - assert False, ('(very low values) expected %.5f but ' - 'got %.5f, with decimal %d' % (fb, fa, decimal)) + assert False, (f'(very low values) expected {fb:.5f} ' + f'but got {fa:.5f}, with decimal {decimal}') else: if not decimal_almost_equal(1, fb / fa, decimal): - assert False, ('expected %.5f but got %.5f, ' - 'with decimal %d' % (fb, fa, decimal)) + assert False, (f'expected {fb:.5f} but got {fa:.5f}, ' + f'with decimal {decimal}') return True raise AssertionError(f"{a} != {b}") diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d101a2976cd55..01d90900cd604 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -279,7 +279,7 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, elif show_us: res += '.%.6d' % dts.us elif show_ms: - res += '.%.3d' % (dts.us /1000) + res += '.%.3d' % (dts.us / 1000) result[i] = res From 112e6b8d054f9adc1303138533ed6506975f94db Mon Sep 17 00:00:00 2001 From: Gyeongjae Choi Date: Wed, 13 Nov 2019 08:17:46 +0900 Subject: [PATCH 015/185] BUG: GH29595 fix read_json() to use utf-8 for a default encoding (#29566) --- doc/source/whatsnew/v1.0.0.rst | 2 ++ pandas/io/json/_json.py | 2 ++ pandas/tests/io/json/test_readlines.py | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5a5584f89d3a1..93f59f6a6a614 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -400,6 +400,8 @@ I/O - Bug in :meth:`DataFrame.to_html` when using ``formatters=`` and ``max_cols`` together. 
(:issue:`25955`) - Bug in :meth:`Styler.background_gradient` not able to work with dtype ``Int64`` (:issue:`28869`) - Bug in :meth:`DataFrame.to_clipboard` which did not work reliably in ipython (:issue:`22707`) +- Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) +- Plotting ^^^^^^^^ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 8615355996031..0a8f275cf54a9 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -577,6 +577,8 @@ def read_json( dtype = True if convert_axes is None and orient != "table": convert_axes = True + if encoding is None: + encoding = "utf-8" compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 05f97a1769205..c4e03e24a7495 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -173,3 +173,14 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): tm.assert_frame_equal( orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize) ) + + +def test_readjson_unicode(monkeypatch): + with tm.ensure_clean("test.json") as path: + monkeypatch.setattr("_bootlocale.getpreferredencoding", lambda l: "cp949") + with open(path, "w", encoding="utf-8") as f: + f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') + + result = read_json(path) + expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) + tm.assert_frame_equal(result, expected) From 69856e3d7689713c672960de73e7a5457d6253b5 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 12 Nov 2019 23:19:38 +0000 Subject: [PATCH 016/185] REF: rename labels to codes in safe_sort and _factorize (#29552) --- pandas/core/algorithms.py | 75 ++++++++++++++++++------------------ pandas/tests/test_sorting.py | 72 +++++++++++++++++----------------- 2 files changed, 74 insertions(+), 73 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 23675752a4593..b49a9d7957d51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -448,9 +448,11 @@ def isin(comps, values) -> np.ndarray: return f(comps, values) -def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=None): +def _factorize_array( + values, na_sentinel: int = -1, size_hint=None, na_value=None +) -> Tuple[np.ndarray, np.ndarray]: """ - Factorize an array-like to labels and uniques. + Factorize an array-like to codes and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -468,18 +470,16 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non Returns ------- - labels : ndarray + codes : ndarray uniques : ndarray """ hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) - uniques, labels = table.factorize( - values, na_sentinel=na_sentinel, na_value=na_value - ) + uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) - labels = ensure_platform_int(labels) - return labels, uniques + codes = ensure_platform_int(codes) + return codes, uniques _shared_docs[ @@ -1924,33 +1924,34 @@ def diff(arr, n: int, axis: int = 0): # this module. def safe_sort( values, - labels=None, + codes=None, na_sentinel: int = -1, assume_unique: bool = False, verify: bool = True, -): +) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: """ - Sort ``values`` and reorder corresponding ``labels``. 
- ``values`` should be unique if ``labels`` is not None. + Sort ``values`` and reorder corresponding ``codes``. + + ``values`` should be unique if ``codes`` is not None. Safe for use with mixed types (int, str), orders ints before strs. Parameters ---------- values : list-like - Sequence; must be unique if ``labels`` is not None. - labels : list_like + Sequence; must be unique if ``codes`` is not None. + codes : list_like, optional Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 - Value in ``labels`` to mark "not found". - Ignored when ``labels`` is None. + Value in ``codes`` to mark "not found". + Ignored when ``codes`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up - the calculation. Ignored when ``labels`` is None. + the calculation. Ignored when ``codes`` is None. verify : bool, default True - Check if labels are out of bound for the values and put out of bound - labels equal to na_sentinel. If ``verify=False``, it is assumed there - are no out of bound labels. Ignored when ``labels`` is None. + Check if codes are out of bound for the values and put out of bound + codes equal to na_sentinel. If ``verify=False``, it is assumed there + are no out of bound codes. Ignored when ``codes`` is None. .. versionadded:: 0.25.0 @@ -1958,17 +1959,17 @@ def safe_sort( ------- ordered : ndarray Sorted ``values`` - new_labels : ndarray - Reordered ``labels``; returned when ``labels`` is not None. + new_codes : ndarray + Reordered ``codes``; returned when ``codes`` is not None. Raises ------ TypeError - * If ``values`` is not list-like or if ``labels`` is neither None + * If ``values`` is not list-like or if ``codes`` is neither None nor list-like * If ``values`` cannot be sorted ValueError - * If ``labels`` is not None and ``values`` contain duplicates. + * If ``codes`` is not None and ``values`` contain duplicates. 
""" if not is_list_like(values): raise TypeError( @@ -2002,22 +2003,22 @@ def sort_mixed(values): # try this anyway ordered = sort_mixed(values) - # labels: + # codes: - if labels is None: + if codes is None: return ordered - if not is_list_like(labels): + if not is_list_like(codes): raise TypeError( "Only list-like objects or None are allowed to be" - "passed to safe_sort as labels" + "passed to safe_sort as codes" ) - labels = ensure_platform_int(np.asarray(labels)) + codes = ensure_platform_int(np.asarray(codes)) from pandas import Index if not assume_unique and not Index(values).is_unique: - raise ValueError("values should be unique if labels is not None") + raise ValueError("values should be unique if codes is not None") if sorter is None: # mixed types @@ -2029,9 +2030,9 @@ def sort_mixed(values): if na_sentinel == -1: # take_1d is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() - new_labels = take_1d(order2, labels, fill_value=-1) + new_codes = take_1d(order2, codes, fill_value=-1) if verify: - mask = (labels < -len(values)) | (labels >= len(values)) + mask = (codes < -len(values)) | (codes >= len(values)) else: mask = None else: @@ -2039,13 +2040,13 @@ def sort_mixed(values): reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` - new_labels = reverse_indexer.take(labels, mode="wrap") + new_codes = reverse_indexer.take(codes, mode="wrap") - mask = labels == na_sentinel + mask = codes == na_sentinel if verify: - mask = mask | (labels < -len(values)) | (labels >= len(values)) + mask = mask | (codes < -len(values)) | (codes >= len(values)) if mask is not None: - np.putmask(new_labels, mask, na_sentinel) + np.putmask(new_codes, mask, na_sentinel) - return ordered, ensure_platform_int(new_labels) + return ordered, ensure_platform_int(new_codes) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 5d7eb70817a11..90cd9cc3e006d 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -314,27 +314,27 @@ def verify_order(df): def test_decons(): - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) + def testit(codes_list, shape): + group_index = get_group_index(codes_list, shape, sort=True, xnull=True) + codes_list2 = decons_group_index(group_index, shape) - for a, b in zip(label_list, label_list2): + for a, b in zip(codes_list, codes_list2): tm.assert_numpy_array_equal(a, b) shape = (4, 5, 6) - label_list = [ + codes_list = [ np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), ] - testit(label_list, shape) + testit(codes_list, shape) shape = (10000, 10000) - label_list = [ + codes_list = [ np.tile(np.arange(10000, dtype=np.int64), 5), np.tile(np.arange(10000, dtype=np.int64), 5), ] - testit(label_list, shape) + testit(codes_list, shape) class TestSafeSort: @@ -355,42 +355,42 @@ def test_basic_sort(self): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) - def test_labels(self, verify): + def test_codes(self, verify): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) - labels = [0, 1, 1, 2, 3, 0, -1, 4] - result, result_labels = safe_sort(values, labels, verify=verify) - expected_labels = np.array([3, 1, 
1, 2, 0, 3, -1, 4], dtype=np.intp) + codes = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) # na_sentinel - labels = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=99, verify=verify) - expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) + codes = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) - labels = [] - result, result_labels = safe_sort(values, labels, verify=verify) - expected_labels = np.array([], dtype=np.intp) + codes = [] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_labels_out_of_bound(self, na_sentinel): + def test_codes_out_of_bound(self, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) # out of bound indices - labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=na_sentinel) - expected_labels = np.array( + codes = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel) + expected_codes = np.array( [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp ) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) def test_mixed_integer(self): values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) @@ -399,12 +399,12 @@ def test_mixed_integer(self): tm.assert_numpy_array_equal(result, expected) values = np.array(["b", 1, 0, "a"], dtype=object) - labels = [0, 1, 2, 3, 0, -1, 1] - result, result_labels = safe_sort(values, labels) + codes = [0, 1, 2, 3, 0, -1, 1] + result, result_codes = safe_sort(values, codes) expected = np.array([0, 1, "a", "b"], dtype=object) - expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) + expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) def test_mixed_integer_from_list(self): values = ["b", 1, 0, "a", 0, "b"] @@ -428,10 +428,10 @@ def test_exceptions(self): safe_sort(values=1) with pytest.raises(TypeError, match="Only list-like objects or None"): - safe_sort(values=[0, 1, 2], labels=1) + safe_sort(values=[0, 1, 2], codes=1) with pytest.raises(ValueError, match="values should be unique"): - safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + safe_sort(values=[0, 1, 2, 1], codes=[0, 1]) def test_extension_array(self): # a = array([1, 3, np.nan, 2], dtype='Int64') @@ -443,12 +443,12 @@ def test_extension_array(self): @pytest.mark.parametrize("verify", [True, False]) 
@pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_extension_array_labels(self, verify, na_sentinel): + def test_extension_array_codes(self, verify, na_sentinel): a = array([1, 3, 2], dtype="Int64") - result, labels = safe_sort( + result, codes = safe_sort( a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify ) expected_values = array([1, 2, 3], dtype="Int64") - expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp) + expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) From 9cd6049cbb51bd1deb763beef33fd16752f00171 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 Nov 2019 15:22:59 -0800 Subject: [PATCH 017/185] REF: de-duplicate _update_cached_objs (#29548) --- pandas/_libs/reduction.pyx | 46 +++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index a19226670ec0a..fa9c12777eb5b 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -194,6 +194,23 @@ cdef class _BaseGrouper: return values, index + cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp, + Slider islider, Slider vslider, object name): + if cached_typ is None: + cached_ityp = self.ityp(islider.buf) + cached_typ = self.typ(vslider.buf, index=cached_ityp, name=name) + else: + # See the comment in indexes/base.py about _index_data. + # We need this for EA-backed indexes that have a reference + # to a 1-d ndarray like datetime / timedelta / period. + object.__setattr__(cached_ityp, '_index_data', islider.buf) + cached_ityp._engine.clear_mapping() + object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__(cached_typ, '_index', cached_ityp) + object.__setattr__(cached_typ, 'name', name) + + return cached_typ, cached_ityp + cdef class SeriesBinGrouper(_BaseGrouper): """ @@ -265,20 +282,8 @@ cdef class SeriesBinGrouper(_BaseGrouper): islider.set_length(group_size) vslider.set_length(group_size) - if cached_typ is None: - cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, - name=name) - else: - # See the comment in indexes/base.py about _index_data. - # We need this for EA-backed indexes that have a reference - # to a 1-d ndarray like datetime / timedelta / period. 
- object.__setattr__(cached_ityp, '_index_data', islider.buf) - cached_ityp._engine.clear_mapping() - object.__setattr__( - cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', name) + cached_typ, cached_ityp = self._update_cached_objs( + cached_typ, cached_ityp, islider, vslider, name) cached_ityp._engine.clear_mapping() res = self.f(cached_typ) @@ -379,17 +384,8 @@ cdef class SeriesGrouper(_BaseGrouper): islider.set_length(group_size) vslider.set_length(group_size) - if cached_typ is None: - cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, - name=name) - else: - object.__setattr__(cached_ityp, '_data', islider.buf) - cached_ityp._engine.clear_mapping() - object.__setattr__( - cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', name) + cached_typ, cached_ityp = self._update_cached_objs( + cached_typ, cached_ityp, islider, vslider, name) cached_ityp._engine.clear_mapping() res = self.f(cached_typ) From beb23bddcf35f021ba659d5695a62f21ba83e913 Mon Sep 17 00:00:00 2001 From: ganevgv Date: Tue, 12 Nov 2019 23:29:27 +0000 Subject: [PATCH 018/185] TST: add test for df comparing strings to numbers raises ValueError (#29535) --- pandas/tests/frame/test_operators.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 19d91241d6a6b..ac60f04248da5 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -530,6 +530,16 @@ def test_comp(func): test_comp(operator.ge) test_comp(operator.le) + def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): + # GH 11565 + df = DataFrame( + {x: {"x": "foo", "y": "bar", "z": "baz"} for x in ["a", "b", "c"]} + ) + + f = getattr(operator, compare_operators_no_eq_ne) + with pytest.raises(TypeError): + f(df, 0) + def test_comparison_protected_from_errstate(self): missing_df = tm.makeDataFrame() missing_df.iloc[0]["A"] = np.nan From 29718c34b816eee657ce0738eddc110638beab9d Mon Sep 17 00:00:00 2001 From: ganevgv Date: Tue, 12 Nov 2019 23:32:45 +0000 Subject: [PATCH 019/185] TST: add tests for Series/DF logical operations nan propagation (#29531) --- pandas/tests/frame/test_operators.py | 36 +++++++++++++++++++++++++++ pandas/tests/series/test_operators.py | 36 +++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index ac60f04248da5..f3e61dffb500d 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -218,6 +218,42 @@ def test_logical_with_nas(self): expected = Series([True, True]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + def test_logical_operators_nans(self, left, right, op, expected): + # GH 13896 + result = op(DataFrame(left), DataFrame(right)) + expected = DataFrame(expected) + + 
tm.assert_frame_equal(result, expected) + class TestDataFrameOperators: @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 7d212ee7cd667..983560d68c28c 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -43,6 +43,42 @@ def test_logical_operators_bool_dtype_with_empty(self): expected = s_tft tm.assert_series_equal(res, expected) + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + def test_logical_operators_nans(self, left, right, op, expected): + # GH 13896 + result = op(Series(left), Series(right)) + expected = Series(expected) + + tm.assert_series_equal(result, expected) + def test_logical_operators_int_dtype_with_int_dtype(self): # GH#9016: support bitwise op for integer types From 17db6c565d6985989759fbb4c3b92f8ecd859de1 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 12 Nov 2019 23:35:53 +0000 Subject: [PATCH 020/185] API: Remove kwargs from GroupBy (#29511) --- pandas/core/generic.py | 11 ++----- pandas/core/groupby/generic.py | 4 +-- pandas/core/groupby/groupby.py | 51 +++++++++++++++++++++-------- pandas/core/resample.py | 6 ++-- pandas/tests/window/test_grouper.py | 9 ++--- 5 files changed, 51 insertions(+), 30 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c8ce7561a12b8..1b892c02ba7bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7830,7 +7830,6 @@ def groupby( group_keys=True, squeeze=False, observed=False, - **kwargs ): """ Group DataFrame or Series using a mapper or by a Series of columns. @@ -7876,10 +7875,6 @@ def groupby( .. versionadded:: 0.23.0 - **kwargs - Optional, only accepts keyword argument 'mutated' and is passed - to groupby. 
- Returns ------- DataFrameGroupBy or SeriesGroupBy @@ -7941,12 +7936,13 @@ def groupby( Captive 210.0 Wild 185.0 """ - from pandas.core.groupby.groupby import groupby + from pandas.core.groupby.groupby import get_groupby if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby( + + return get_groupby( self, by=by, axis=axis, @@ -7956,7 +7952,6 @@ def groupby( group_keys=group_keys, squeeze=squeeze, observed=observed, - **kwargs ) def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 569f57346be4e..dda98d2dd438b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -62,7 +62,7 @@ GroupBy, _apply_docs, _transform_template, - groupby, + get_groupby, ) from pandas.core.index import Index, MultiIndex, _all_indexes_same import pandas.core.indexes.base as ibase @@ -996,7 +996,7 @@ def _cython_agg_blocks( # reductions; see GH#28949 obj = obj.iloc[:, 0] - s = groupby(obj, self.grouper) + s = get_groupby(obj, self.grouper) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f9f20d13f9585..204346bb7b741 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -26,7 +26,6 @@ class providing the base-class of operations. from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.util._validators import validate_kwargs from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -349,12 +348,12 @@ def __init__( grouper=None, exclusions=None, selection=None, - as_index=True, - sort=True, - group_keys=True, - squeeze=False, - observed=False, - **kwargs + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + mutated: bool = False, ): self._selection = selection @@ -376,7 +375,7 @@ def __init__( self.group_keys = group_keys self.squeeze = squeeze self.observed = observed - self.mutated = kwargs.pop("mutated", False) + self.mutated = mutated if grouper is None: from pandas.core.groupby.grouper import get_grouper @@ -396,9 +395,6 @@ def __init__( self.grouper = grouper self.exclusions = set(exclusions) if exclusions else set() - # we accept no other args - validate_kwargs("group", kwargs, {}) - def __len__(self) -> int: return len(self.groups) @@ -2482,7 +2478,22 @@ def _reindex_output(self, output): @Appender(GroupBy.__doc__) -def groupby(obj: NDFrame, by, **kwds): +def get_groupby( + obj: NDFrame, + by=None, + axis: int = 0, + level=None, + grouper=None, + exclusions=None, + selection=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + mutated: bool = False, +): + if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy @@ -2496,4 +2507,18 @@ def groupby(obj: NDFrame, by, **kwds): else: raise TypeError("invalid type: {obj}".format(obj=obj)) - return klass(obj, by, **kwds) + return klass( + obj=obj, + keys=by, + axis=axis, + level=level, + grouper=grouper, + exclusions=exclusions, + selection=selection, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + 
mutated=mutated, + ) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6d877bf666881..d980d5ba0be6e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -21,7 +21,7 @@ from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, groupby +from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range @@ -334,7 +334,7 @@ def _gotitem(self, key, ndim, subset=None): grouper = self.grouper if subset is None: subset = self.obj - grouped = groupby(subset, by=None, grouper=grouper, axis=self.axis) + grouped = get_groupby(subset, by=None, grouper=grouper, axis=self.axis) # try the key selection try: @@ -353,7 +353,7 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): obj = self._selected_obj - grouped = groupby(obj, by=None, grouper=grouper, axis=self.axis) + grouped = get_groupby(obj, by=None, grouper=grouper, axis=self.axis) try: if isinstance(obj, ABCDataFrame) and callable(how): diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index b726bd3e3c8a7..c278897e1d395 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -3,6 +3,7 @@ import pandas as pd from pandas import DataFrame, Series +from pandas.core.groupby.groupby import get_groupby import pandas.util.testing as tm @@ -13,18 +14,18 @@ def setup_method(self, method): def test_mutated(self): - msg = r"group\(\) got an unexpected keyword argument 'foo'" + msg = r"groupby\(\) got an unexpected keyword argument 'foo'" with pytest.raises(TypeError, match=msg): self.frame.groupby("A", foo=1) g = self.frame.groupby("A") assert not g.mutated - g = self.frame.groupby("A", mutated=True) + g = get_groupby(self.frame, by="A", mutated=True) assert g.mutated def test_getitem(self): g = self.frame.groupby("A") - g_mutated = self.frame.groupby("A", mutated=True) + g_mutated = get_groupby(self.frame, by="A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) @@ -45,7 +46,7 @@ def test_getitem_multiple(self): # GH 13174 g = self.frame.groupby("A") r = g.rolling(2) - g_mutated = self.frame.groupby("A", mutated=True) + g_mutated = get_groupby(self.frame, by="A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) result = r.B.count() From eabf89d174ed8164a09ae28c80e03e1fe5bf14c5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 Nov 2019 15:38:26 -0800 Subject: [PATCH 021/185] CLN: annotations in core.dtypes (#29503) --- pandas/core/dtypes/base.py | 17 +++++++---- pandas/core/dtypes/cast.py | 34 +++++++++++---------- pandas/core/dtypes/common.py | 53 +++++++++++++++++---------------- pandas/core/dtypes/concat.py | 10 ++++--- pandas/core/dtypes/dtypes.py | 26 ++++++++-------- pandas/core/dtypes/generic.py | 4 +-- pandas/core/dtypes/inference.py | 24 +++++++-------- pandas/core/dtypes/missing.py | 13 ++++---- 8 files changed, 97 insertions(+), 84 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 2cc7c44cc05af..d9d3b0d45e218 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -87,7 +87,8 @@ def __str__(self) -> str: return self.name def __eq__(self, other): - """Check whether 
'other' is equal to self. + """ + Check whether 'other' is equal to self. By default, 'other' is considered equal if either @@ -115,7 +116,7 @@ def __eq__(self, other): ) return False - def __hash__(self): + def __hash__(self) -> int: return hash(tuple(getattr(self, attr) for attr in self._metadata)) def __ne__(self, other): @@ -171,7 +172,8 @@ def name(self) -> str: @property def names(self) -> Optional[List[str]]: - """Ordered list of field names, or None if there are no fields. + """ + Ordered list of field names, or None if there are no fields. This is for compatibility with NumPy arrays, and may be removed in the future. @@ -233,16 +235,19 @@ def construct_from_string(cls, string: str): ... "'{}'".format(cls.__name__, string)) """ if not isinstance(string, str): - raise TypeError("Expects a string, got {}".format(type(string))) + raise TypeError("Expects a string, got {typ}".format(typ=type(string))) if string != cls.name: raise TypeError( - "Cannot construct a '{}' from '{}'".format(cls.__name__, string) + "Cannot construct a '{cls}' from '{string}'".format( + cls=cls.__name__, string=string + ) ) return cls() @classmethod def is_dtype(cls, dtype) -> bool: - """Check if we match 'dtype'. + """ + Check if we match 'dtype'. Parameters ---------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 637c42eef8a5a..acf8b6ca4e312 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -72,7 +72,7 @@ def maybe_convert_platform(values): return values -def is_nested_object(obj): +def is_nested_object(obj) -> bool: """ return a boolean if we have a nested object, e.g. a Series with 1 or more Series elements @@ -500,11 +500,11 @@ def _ensure_dtype_type(value, dtype): def infer_dtype_from(val, pandas_dtype: bool = False): """ - interpret the dtype from a scalar or array. This is a convenience - routines to infer dtype from a scalar or an array + Interpret the dtype from a scalar or array. Parameters ---------- + val : object pandas_dtype : bool, default False whether to infer dtype including pandas extension types. If False, scalar/array belongs to pandas extension types is inferred as @@ -517,7 +517,7 @@ def infer_dtype_from(val, pandas_dtype: bool = False): def infer_dtype_from_scalar(val, pandas_dtype: bool = False): """ - interpret the dtype from a scalar + Interpret the dtype from a scalar. Parameters ---------- @@ -592,7 +592,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False): def infer_dtype_from_array(arr, pandas_dtype: bool = False): """ - infer the dtype from a scalar or array + Infer the dtype from a scalar or array. Parameters ---------- @@ -647,7 +647,8 @@ def infer_dtype_from_array(arr, pandas_dtype: bool = False): def maybe_infer_dtype_type(element): - """Try to infer an object's dtype, for use in arithmetic ops + """ + Try to infer an object's dtype, for use in arithmetic ops. Uses `element.dtype` if that's available. Objects implementing the iterator protocol are cast to a NumPy array, @@ -679,8 +680,9 @@ def maybe_infer_dtype_type(element): return tipo -def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): - """ provide explicit type promotion and coercion +def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): + """ + Provide explicit type promotion and coercion. 
Parameters ---------- @@ -759,7 +761,7 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy=True, skipna=False): +def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): """ Cast the elements of an array to a given dtype a nan-safe manner. @@ -982,7 +984,7 @@ def soft_convert_objects( return values -def maybe_castable(arr): +def maybe_castable(arr) -> bool: # return False to force a non-fastpath # check datetime64[ns]/timedelta64[ns] are valid @@ -996,7 +998,7 @@ def maybe_castable(arr): return arr.dtype.name not in _POSSIBLY_CAST_DTYPES -def maybe_infer_to_datetimelike(value, convert_dates=False): +def maybe_infer_to_datetimelike(value, convert_dates: bool = False): """ we might have a array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a @@ -1103,7 +1105,7 @@ def try_timedelta(v): return value -def maybe_cast_to_datetime(value, dtype, errors="raise"): +def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ @@ -1292,7 +1294,7 @@ def find_common_type(types): def cast_scalar_to_array(shape, value, dtype=None): """ - create np.ndarray of specified shape and dtype, filled with values + Create np.ndarray of specified shape and dtype, filled with values. Parameters ---------- @@ -1318,7 +1320,7 @@ def cast_scalar_to_array(shape, value, dtype=None): return values -def construct_1d_arraylike_from_scalar(value, length, dtype): +def construct_1d_arraylike_from_scalar(value, length: int, dtype): """ create a np.ndarray / pandas type of specified shape and dtype filled with values @@ -1383,7 +1385,7 @@ def construct_1d_object_array_from_listlike(values): return result -def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): +def construct_1d_ndarray_preserving_na(values, dtype=None, copy: bool = False): """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. @@ -1424,7 +1426,7 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): return subarr -def maybe_cast_to_integer_array(arr, dtype, copy=False): +def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4f9481eccb836..1ed54c12f4a34 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -222,7 +222,7 @@ def classes_and_not_datetimelike(*klasses) -> Callable: ) -def is_object_dtype(arr_or_dtype): +def is_object_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the object dtype. @@ -252,7 +252,7 @@ def is_object_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.object_)) -def is_sparse(arr): +def is_sparse(arr) -> bool: """ Check whether an array-like is a 1-D pandas sparse array. @@ -304,7 +304,7 @@ def is_sparse(arr): return isinstance(dtype, SparseDtype) -def is_scipy_sparse(arr): +def is_scipy_sparse(arr) -> bool: """ Check whether an array-like is a scipy.sparse.spmatrix instance. 
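The next hunk adds an ``assert _is_scipy_sparse is not None`` before the call. For context, here is a minimal self-contained sketch of the lazy-import pattern this module uses (illustrative only, not part of the patch; the assert plausibly exists so a static type checker can narrow the cached callable from ``Optional`` before it is invoked):

_is_scipy_sparse = None  # module-level cache, bound on first use


def is_scipy_sparse_sketch(arr) -> bool:
    # Import scipy only when the check is first needed; if scipy is not
    # installed, fall back to a stub that always answers False.
    global _is_scipy_sparse
    if _is_scipy_sparse is None:
        try:
            from scipy.sparse import issparse as _is_scipy_sparse
        except ImportError:
            _is_scipy_sparse = lambda _: False
    # Mirrors the assert in the hunk below: narrows the Optional cache
    # to a callable before invoking it.
    assert _is_scipy_sparse is not None
    return _is_scipy_sparse(arr)
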
@@ -339,6 +339,7 @@ def is_scipy_sparse(arr): except ImportError: _is_scipy_sparse = lambda _: False + assert _is_scipy_sparse is not None return _is_scipy_sparse(arr) @@ -375,7 +376,7 @@ def is_categorical(arr) -> bool: return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) -def is_datetimetz(arr): +def is_datetimetz(arr) -> bool: """ Check whether an array-like is a datetime array-like with a timezone component in its dtype. @@ -425,7 +426,7 @@ def is_datetimetz(arr): return is_datetime64tz_dtype(arr) -def is_offsetlike(arr_or_obj): +def is_offsetlike(arr_or_obj) -> bool: """ Check if obj or all elements of list-like is DateOffset @@ -456,7 +457,7 @@ def is_offsetlike(arr_or_obj): return False -def is_period(arr): +def is_period(arr) -> bool: """ Check whether an array-like is a periodical index. @@ -493,7 +494,7 @@ def is_period(arr): return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr) -def is_datetime64_dtype(arr_or_dtype): +def is_datetime64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the datetime64 dtype. @@ -524,7 +525,7 @@ def is_datetime64_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.datetime64)) -def is_datetime64tz_dtype(arr_or_dtype): +def is_datetime64tz_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of a DatetimeTZDtype dtype. @@ -562,7 +563,7 @@ def is_datetime64tz_dtype(arr_or_dtype): return DatetimeTZDtype.is_dtype(arr_or_dtype) -def is_timedelta64_dtype(arr_or_dtype): +def is_timedelta64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the timedelta64 dtype. @@ -593,7 +594,7 @@ def is_timedelta64_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.timedelta64)) -def is_period_dtype(arr_or_dtype): +def is_period_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Period dtype. @@ -627,7 +628,7 @@ def is_period_dtype(arr_or_dtype): return PeriodDtype.is_dtype(arr_or_dtype) -def is_interval_dtype(arr_or_dtype): +def is_interval_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Interval dtype. @@ -696,7 +697,7 @@ def is_categorical_dtype(arr_or_dtype) -> bool: return CategoricalDtype.is_dtype(arr_or_dtype) -def is_string_dtype(arr_or_dtype): +def is_string_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the string dtype. @@ -732,7 +733,7 @@ def condition(dtype): return _is_dtype(arr_or_dtype, condition) -def is_period_arraylike(arr): +def is_period_arraylike(arr) -> bool: """ Check whether an array-like is a periodical array-like or PeriodIndex. @@ -764,7 +765,7 @@ def is_period_arraylike(arr): return getattr(arr, "inferred_type", None) == "period" -def is_datetime_arraylike(arr): +def is_datetime_arraylike(arr) -> bool: """ Check whether an array-like is a datetime array-like or DatetimeIndex. @@ -799,7 +800,7 @@ def is_datetime_arraylike(arr): return getattr(arr, "inferred_type", None) == "datetime" -def is_dtype_equal(source, target): +def is_dtype_equal(source, target) -> bool: """ Check if two dtypes are equal. @@ -889,7 +890,7 @@ def is_any_int_dtype(arr_or_dtype) -> bool: return _is_dtype_type(arr_or_dtype, classes(np.integer, np.timedelta64)) -def is_integer_dtype(arr_or_dtype): +def is_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of an integer dtype. 
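The predicates annotated in the hunks above are re-exported publicly through ``pandas.api.types``; a short usage sketch (illustrative only, assuming pandas and NumPy are installed):

import numpy as np
import pandas as pd
from pandas.api.types import (
    is_datetime64_dtype,
    is_datetime64tz_dtype,
    is_dtype_equal,
)

aware = pd.DatetimeIndex(["2019-11-12"], tz="UTC")
# tz-aware data carries DatetimeTZDtype, so the plain datetime64 check rejects it
assert is_datetime64tz_dtype(aware)
assert not is_datetime64_dtype(aware)
# is_dtype_equal coerces its arguments, so strings and dtype objects compare
assert is_dtype_equal("int64", np.dtype("int64"))
assert not is_dtype_equal("int64", "Int64")  # NumPy int64 vs nullable Int64
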
@@ -944,7 +945,7 @@ def is_integer_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.integer)) -def is_signed_integer_dtype(arr_or_dtype): +def is_signed_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a signed integer dtype. @@ -1001,7 +1002,7 @@ def is_signed_integer_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)) -def is_unsigned_integer_dtype(arr_or_dtype): +def is_unsigned_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of an unsigned integer dtype. @@ -1050,7 +1051,7 @@ def is_unsigned_integer_dtype(arr_or_dtype): ) -def is_int64_dtype(arr_or_dtype): +def is_int64_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the int64 dtype. @@ -1141,7 +1142,7 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) -def is_datetime64_ns_dtype(arr_or_dtype): +def is_datetime64_ns_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the datetime64[ns] dtype. @@ -1191,7 +1192,7 @@ def is_datetime64_ns_dtype(arr_or_dtype): return tipo == _NS_DTYPE or getattr(tipo, "base", None) == _NS_DTYPE -def is_timedelta64_ns_dtype(arr_or_dtype): +def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the timedelta64[ns] dtype. @@ -1222,7 +1223,7 @@ def is_timedelta64_ns_dtype(arr_or_dtype): return _is_dtype(arr_or_dtype, lambda dtype: dtype == _TD_DTYPE) -def is_datetime_or_timedelta_dtype(arr_or_dtype): +def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a timedelta64 or datetime64 dtype. @@ -1285,7 +1286,7 @@ def _is_unorderable_exception(e: TypeError) -> bool: return "unorderable" in str(e) -def needs_i8_conversion(arr_or_dtype): +def needs_i8_conversion(arr_or_dtype) -> bool: """ Check whether the array or dtype should be converted to int64. @@ -1329,7 +1330,7 @@ def needs_i8_conversion(arr_or_dtype): ) -def is_numeric_dtype(arr_or_dtype): +def is_numeric_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a numeric dtype. @@ -1372,7 +1373,7 @@ def is_numeric_dtype(arr_or_dtype): ) -def is_string_like_dtype(arr_or_dtype): +def is_string_like_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a string-like dtype. @@ -1404,7 +1405,7 @@ def is_string_like_dtype(arr_or_dtype): return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U")) -def is_float_dtype(arr_or_dtype): +def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. 
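Likewise for the integer and float predicates annotated above; note the signed/unsigned split (illustrative sketch, same assumptions as before):

import numpy as np
from pandas.api.types import (
    is_float_dtype,
    is_signed_integer_dtype,
    is_unsigned_integer_dtype,
)

assert is_signed_integer_dtype(np.dtype("int32"))
assert is_unsigned_integer_dtype(np.dtype("uint8"))
assert not is_signed_integer_dtype(np.dtype("uint8"))  # unsigned is not signed
assert is_float_dtype(np.dtype("float32"))
assert not is_float_dtype(np.dtype("int64"))
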
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a62d3d0f4e65b..768272e173c82 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -69,7 +69,7 @@ def get_dtype_kinds(l): return typs -def concat_compat(to_concat, axis=0): +def concat_compat(to_concat, axis: int = 0): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -137,7 +137,7 @@ def is_nonempty(x): return np.concatenate(to_concat, axis=axis) -def concat_categorical(to_concat, axis=0): +def concat_categorical(to_concat, axis: int = 0): """Concatenate an object/categorical array of arrays, each of which is a single dtype @@ -183,7 +183,9 @@ def concat_categorical(to_concat, axis=0): return result -def union_categoricals(to_union, sort_categories=False, ignore_order=False): +def union_categoricals( + to_union, sort_categories: bool = False, ignore_order: bool = False +): """ Combine list-like of Categorical-like, unioning categories. @@ -355,7 +357,7 @@ def _maybe_unwrap(x): return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) -def _concatenate_2d(to_concat, axis): +def _concatenate_2d(to_concat, axis: int): # coerce to 2d if needed & concatenate if axis == 1: to_concat = [np.atleast_2d(x) for x in to_concat] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 4a4ad076f14ca..a0712a0df237b 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -416,12 +416,12 @@ def __eq__(self, other: Any) -> bool: return hash(self) == hash(other) def __repr__(self) -> str_type: - tpl = "CategoricalDtype(categories={}ordered={})" + tpl = "CategoricalDtype(categories={data}ordered={ordered})" if self.categories is None: data = "None, " else: data = self.categories._format_data(name=self.__class__.__name__) - return tpl.format(data, self._ordered) + return tpl.format(data=data, ordered=self._ordered) @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: @@ -719,7 +719,7 @@ def construct_array_type(cls): return DatetimeArray @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string: str_type): """ Construct a DatetimeTZDtype from a string. @@ -736,7 +736,7 @@ def construct_from_string(cls, string): datetime64[ns, UTC] """ if isinstance(string, str): - msg = "Could not construct DatetimeTZDtype from '{}'" + msg = "Could not construct DatetimeTZDtype from '{string}'" match = cls._match.match(string) if match: d = match.groupdict() @@ -747,8 +747,8 @@ def construct_from_string(cls, string): # pytz timezone (actually pytz.UnknownTimeZoneError). # TypeError if we pass a nonsense tz; # ValueError if we pass a unit other than "ns" - raise TypeError(msg.format(string)) from err - raise TypeError(msg.format(string)) + raise TypeError(msg.format(string=string)) from err + raise TypeError(msg.format(string=string)) raise TypeError("Could not construct DatetimeTZDtype") @@ -756,11 +756,11 @@ def __str__(self) -> str_type: return "datetime64[{unit}, {tz}]".format(unit=self.unit, tz=self.tz) @property - def name(self): + def name(self) -> str_type: """A string representation of the dtype.""" return str(self) - def __hash__(self): + def __hash__(self) -> int: # make myself hashable # TODO: update this. 
return hash(str(self)) @@ -893,14 +893,14 @@ def __str__(self) -> str_type: return self.name @property - def name(self): + def name(self) -> str_type: return "period[{freq}]".format(freq=self.freq.freqstr) @property def na_value(self): return NaT - def __hash__(self): + def __hash__(self) -> int: # make myself hashable return hash(str(self)) @@ -917,7 +917,7 @@ def __setstate__(self, state): self._freq = state["freq"] @classmethod - def is_dtype(cls, dtype): + def is_dtype(cls, dtype) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) @@ -1073,7 +1073,7 @@ def __str__(self) -> str_type: return "interval" return "interval[{subtype}]".format(subtype=self.subtype) - def __hash__(self): + def __hash__(self) -> int: # make myself hashable return hash(str(self)) @@ -1097,7 +1097,7 @@ def __setstate__(self, state): self._subtype = state["subtype"] @classmethod - def is_dtype(cls, dtype): + def is_dtype(cls, dtype) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 2518f330b26a3..aa0f7d2aba1fc 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -5,7 +5,7 @@ # objects def create_pandas_abc_type(name, attr, comp): @classmethod - def _check(cls, inst): + def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp dct = dict(__instancecheck__=_check, __subclasscheck__=_check) @@ -74,7 +74,7 @@ def _check(cls, inst): class _ABCGeneric(type): - def __instancecheck__(cls, inst): + def __instancecheck__(cls, inst) -> bool: return hasattr(inst, "_data") diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 61fa7940c1bce..9e9278052e35d 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -26,7 +26,7 @@ is_list_like = lib.is_list_like -def is_number(obj): +def is_number(obj) -> bool: """ Check if the object is a number. @@ -67,7 +67,7 @@ def is_number(obj): return isinstance(obj, (Number, np.number)) -def _iterable_not_string(obj): +def _iterable_not_string(obj) -> bool: """ Check if the object is an iterable but not a string. @@ -93,7 +93,7 @@ def _iterable_not_string(obj): return isinstance(obj, abc.Iterable) and not isinstance(obj, str) -def is_iterator(obj): +def is_iterator(obj) -> bool: """ Check if the object is an iterator. @@ -127,7 +127,7 @@ def is_iterator(obj): return hasattr(obj, "__next__") -def is_file_like(obj): +def is_file_like(obj) -> bool: """ Check if the object is a file-like object. @@ -165,7 +165,7 @@ def is_file_like(obj): return True -def is_re(obj): +def is_re(obj) -> bool: """ Check if the object is a regex pattern instance. @@ -188,7 +188,7 @@ def is_re(obj): return isinstance(obj, Pattern) -def is_re_compilable(obj): +def is_re_compilable(obj) -> bool: """ Check if the object can be compiled into a regex pattern instance. @@ -217,7 +217,7 @@ def is_re_compilable(obj): return True -def is_array_like(obj): +def is_array_like(obj) -> bool: """ Check if the object is array-like. @@ -250,7 +250,7 @@ def is_array_like(obj): return is_list_like(obj) and hasattr(obj, "dtype") -def is_nested_list_like(obj): +def is_nested_list_like(obj) -> bool: """ Check if the object is list-like, and that all of its elements are also list-like. @@ -296,7 +296,7 @@ def is_nested_list_like(obj): ) -def is_dict_like(obj): +def is_dict_like(obj) -> bool: """ Check if the object is dict-like. 
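The inference helpers annotated above are duck-typing checks rather than isinstance tests; a quick sketch of their behaviour via the public ``pandas.api.types`` re-exports (illustrative only, not part of the patch):

import io

from pandas.api.types import is_dict_like, is_file_like, is_number

assert is_number(3.14)
assert not is_number("3.14")  # strings are never numbers
assert is_dict_like({"a": 1})
assert not is_dict_like([("a", 1)])  # has __getitem__ but no keys()
assert is_file_like(io.StringIO("data"))  # read or write method plus __iter__
assert not is_file_like("path/to/file")
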
@@ -328,7 +328,7 @@ def is_dict_like(obj): ) -def is_named_tuple(obj): +def is_named_tuple(obj) -> bool: """ Check if the object is a named tuple. @@ -355,7 +355,7 @@ def is_named_tuple(obj): return isinstance(obj, tuple) and hasattr(obj, "_fields") -def is_hashable(obj): +def is_hashable(obj) -> bool: """ Return True if hash(obj) will succeed, False otherwise. @@ -392,7 +392,7 @@ def is_hashable(obj): return True -def is_sequence(obj): +def is_sequence(obj) -> bool: """ Check if the object is a sequence of objects. String types are not included as sequences here. diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 0a8f636b4cb2a..df89bd374f22e 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -157,7 +157,8 @@ def _isna_new(obj): def _isna_old(obj): - """Detect missing values. Treat None, NaN, INF, -INF as null. + """ + Detect missing values, treating None, NaN, INF, -INF as null. Parameters ---------- @@ -190,7 +191,9 @@ def _isna_old(obj): def _use_inf_as_na(key): - """Option change callback for na/inf behaviour + """ + Option change callback for na/inf behaviour. + Choose which replacement for numpy.isnan / -numpy.isfinite is used. Parameters @@ -389,7 +392,7 @@ def _isna_compat(arr, fill_value=np.nan): return True -def array_equivalent(left, right, strict_nan=False): +def array_equivalent(left, right, strict_nan: bool = False): """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and @@ -508,7 +511,7 @@ def _maybe_fill(arr, fill_value=np.nan): return arr -def na_value_for_dtype(dtype, compat=True): +def na_value_for_dtype(dtype, compat: bool = True): """ Return a dtype compat na value @@ -566,7 +569,7 @@ def remove_na_arraylike(arr): return arr[notna(lib.values_from_object(arr))] -def is_valid_nat_for_dtype(obj, dtype): +def is_valid_nat_for_dtype(obj, dtype) -> bool: """ isna check that excludes incompatible dtypes From 93cb6fb8d82e3cd8641f18ffb3f1fbc29d05f772 Mon Sep 17 00:00:00 2001 From: ganevgv Date: Tue, 12 Nov 2019 23:39:09 +0000 Subject: [PATCH 022/185] TST: add test for df.where() with int dtype (#29498) --- pandas/tests/frame/test_dtypes.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 89fd7ccd91f81..cdcd5996324da 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -850,6 +850,31 @@ def test_astype_column_metadata(self, dtype): df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) + def test_df_where_change_dtype(self): + # GH 16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + result = df.where(mask) + expected = DataFrame( + [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") + ) + + tm.assert_frame_equal(result, expected) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = df.C.astype("category") + + result = df.where(mask) + A = pd.Categorical([0, np.nan], categories=[0, 3]) + B = pd.Categorical([np.nan, np.nan], categories=[1, 4]) + C = pd.Categorical([np.nan, 5], categories=[2, 5]) + expected = DataFrame({"A": A, "B": B, "C": C}) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def 
test_astype_from_datetimelike_to_objectt(self, dtype, unit): From 808f482bab754f46c261e96f1305b7a1e9c30f54 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 Nov 2019 15:43:04 -0800 Subject: [PATCH 023/185] CLN: annotation in reshape.merge (#29490) --- pandas/core/reshape/merge.py | 114 ++++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 48 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bc23d50c634d5..2674b7ee95088 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -6,6 +6,7 @@ import datetime from functools import partial import string +from typing import TYPE_CHECKING, Optional, Tuple, Union import warnings import numpy as np @@ -39,6 +40,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, Index, MultiIndex +from pandas._typing import FrameOrSeries import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com @@ -46,22 +48,25 @@ from pandas.core.internals import _transform_index, concatenate_block_managers from pandas.core.sorting import is_int64_overflow_possible +if TYPE_CHECKING: + from pandas import DataFrame, Series # noqa:F401 + @Substitution("\nleft : DataFrame") @Appender(_merge_doc, indents=0) def merge( left, right, - how="inner", + how: str = "inner", on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, - sort=False, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, suffixes=("_x", "_y"), - copy=True, - indicator=False, + copy: bool = True, + indicator: bool = False, validate=None, ): op = _MergeOperation( @@ -86,7 +91,9 @@ def merge( merge.__doc__ = _merge_doc % "\nleft : DataFrame" -def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True): +def _groupby_and_merge( + by, on, left, right, _merge_pieces, check_duplicates: bool = True +): """ groupby & merge; we are always performing a left-by type operation @@ -172,7 +179,7 @@ def merge_ordered( right_by=None, fill_method=None, suffixes=("_x", "_y"), - how="outer", + how: str = "outer", ): """ Perform merge with optional filling/interpolation. 
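For readers who have not used it, the ordered-merge API whose defaults are annotated above behaves roughly as follows (illustrative sketch; the frames and column names are invented):

import pandas as pd

left = pd.DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2, 3]})
right = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})

# how="outer" (the annotated default) keeps every key from both frames in
# order; fill_method="ffill" forward-fills the holes the outer join creates.
result = pd.merge_ordered(left, right, on="key", fill_method="ffill")
print(result)
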
@@ -298,14 +305,14 @@ def merge_asof( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, by=None, left_by=None, right_by=None, suffixes=("_x", "_y"), tolerance=None, - allow_exact_matches=True, + allow_exact_matches: bool = True, direction="backward", ): """ @@ -533,33 +540,33 @@ def merge_asof( # TODO: only copy DataFrames when modification necessary class _MergeOperation: """ - Perform a database (SQL) merge operation between two DataFrame objects - using either columns as keys or their row indexes + Perform a database (SQL) merge operation between two DataFrame or Series + objects using either columns as keys or their row indexes """ _merge_type = "merge" def __init__( self, - left, - right, - how="inner", + left: Union["Series", "DataFrame"], + right: Union["Series", "DataFrame"], + how: str = "inner", on=None, left_on=None, right_on=None, axis=1, - left_index=False, - right_index=False, - sort=True, + left_index: bool = False, + right_index: bool = False, + sort: bool = True, suffixes=("_x", "_y"), - copy=True, - indicator=False, + copy: bool = True, + indicator: bool = False, validate=None, ): - left = validate_operand(left) - right = validate_operand(right) - self.left = self.orig_left = left - self.right = self.orig_right = right + _left = _validate_operand(left) + _right = _validate_operand(right) + self.left = self.orig_left = _validate_operand(_left) # type: "DataFrame" + self.right = self.orig_right = _validate_operand(_right) # type: "DataFrame" self.how = how self.axis = axis @@ -577,7 +584,7 @@ def __init__( self.indicator = indicator if isinstance(self.indicator, str): - self.indicator_name = self.indicator + self.indicator_name = self.indicator # type: Optional[str] elif isinstance(self.indicator, bool): self.indicator_name = "_merge" if self.indicator else None else: @@ -597,11 +604,11 @@ def __init__( ) # warn user when merging between different levels - if left.columns.nlevels != right.columns.nlevels: + if _left.columns.nlevels != _right.columns.nlevels: msg = ( "merging between different levels can give an unintended " "result ({left} levels on the left, {right} on the right)" - ).format(left=left.columns.nlevels, right=right.columns.nlevels) + ).format(left=_left.columns.nlevels, right=_right.columns.nlevels) warnings.warn(msg, UserWarning) self._validate_specification() @@ -658,7 +665,9 @@ def get_result(self): return result - def _indicator_pre_merge(self, left, right): + def _indicator_pre_merge( + self, left: "DataFrame", right: "DataFrame" + ) -> Tuple["DataFrame", "DataFrame"]: columns = left.columns.union(right.columns) @@ -878,7 +887,12 @@ def _get_join_info(self): return join_index, left_indexer, right_indexer def _create_join_index( - self, index, other_index, indexer, other_indexer, how="left" + self, + index: Index, + other_index: Index, + indexer, + other_indexer, + how: str = "left", ): """ Create a join index by rearranging one index to match another @@ -1263,7 +1277,9 @@ def _validate(self, validate: str): raise ValueError("Not a valid argument for validate") -def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs): +def _get_join_indexers( + left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs +): """ Parameters @@ -1410,13 +1426,13 @@ def __init__( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, axis=1, suffixes=("_x", "_y"), - 
copy=True, + copy: bool = True, fill_method=None, - how="outer", + how: str = "outer", ): self.fill_method = fill_method @@ -1508,18 +1524,18 @@ def __init__( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, by=None, left_by=None, right_by=None, axis=1, suffixes=("_x", "_y"), - copy=True, + copy: bool = True, fill_method=None, - how="asof", + how: str = "asof", tolerance=None, - allow_exact_matches=True, + allow_exact_matches: bool = True, direction="backward", ): @@ -1757,13 +1773,15 @@ def flip(xs): return func(left_values, right_values, self.allow_exact_matches, tolerance) -def _get_multiindex_indexer(join_keys, index, sort): +def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): # bind `sort` argument fkeys = partial(_factorize_keys, sort=sort) # left & right join labels and num. of levels at each location - rcodes, lcodes, shape = map(list, zip(*map(fkeys, index.levels, join_keys))) + mapped = (fkeys(index.levels[n], join_keys[n]) for n in range(len(index.levels))) + zipped = zip(*mapped) + rcodes, lcodes, shape = [list(x) for x in zipped] if sort: rcodes = list(map(np.take, rcodes, index.codes)) else: @@ -1791,7 +1809,7 @@ def _get_multiindex_indexer(join_keys, index, sort): return libjoin.left_outer_join(lkey, rkey, count, sort=sort) -def _get_single_indexer(join_key, index, sort=False): +def _get_single_indexer(join_key, index, sort: bool = False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = libjoin.left_outer_join( @@ -1801,7 +1819,7 @@ def _get_single_indexer(join_key, index, sort=False): return left_indexer, right_indexer -def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): +def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): if len(join_keys) > 1: if not ( (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) @@ -1915,7 +1933,7 @@ def _factorize_keys(lk, rk, sort=True): return llab, rlab, count -def _sort_labels(uniques, left, right): +def _sort_labels(uniques: np.ndarray, left, right): if not isinstance(uniques, np.ndarray): # tuplesafe uniques = Index(uniques).values @@ -1930,7 +1948,7 @@ def _sort_labels(uniques, left, right): return new_left, new_right -def _get_join_keys(llab, rlab, shape, sort): +def _get_join_keys(llab, rlab, shape, sort: bool): # how many levels can be done without overflow pred = lambda i: not is_int64_overflow_possible(shape[:i]) @@ -1970,7 +1988,7 @@ def _any(x) -> bool: return x is not None and com.any_not_none(*x) -def validate_operand(obj): +def _validate_operand(obj: FrameOrSeries) -> "DataFrame": if isinstance(obj, ABCDataFrame): return obj elif isinstance(obj, ABCSeries): @@ -1985,7 +2003,7 @@ def validate_operand(obj): ) -def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): +def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): """ If two indices overlap, add suffixes to overlapping entries. 
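As a quick orientation for the annotated ``merge`` signature above — a minimal usage sketch, not part of the patch (the sample frames are invented; ``how``, ``indicator`` and ``validate`` are the keywords visible in the signature):

    import pandas as pd

    left = pd.DataFrame({"key": [1, 2, 3], "lval": [10, 20, 30]})
    right = pd.DataFrame({"key": [1, 2, 2], "rval": [7, 8, 9]})

    # indicator=True appends a "_merge" column marking each row's origin;
    # validate="one_to_many" asserts the left join keys are unique
    result = pd.merge(
        left, right, how="inner", on="key", indicator=True, validate="one_to_many"
    )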
From 57e1b346c8ca15186f6fde134f83d09db90e695e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 Nov 2019 15:44:18 -0800 Subject: [PATCH 024/185] CLN: annotations in core.apply (#29477) --- pandas/core/apply.py | 77 ++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d9f6bdae288ed..9c5806a3fe945 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,4 +1,6 @@ +import abc import inspect +from typing import TYPE_CHECKING, Iterator, Type import numpy as np @@ -13,14 +15,17 @@ ) from pandas.core.dtypes.generic import ABCSeries +if TYPE_CHECKING: + from pandas import DataFrame, Series, Index + def frame_apply( - obj, + obj: "DataFrame", func, axis=0, - raw=False, + raw: bool = False, result_type=None, - ignore_failures=False, + ignore_failures: bool = False, args=None, kwds=None, ): @@ -28,7 +33,7 @@ def frame_apply( axis = obj._get_axis_number(axis) if axis == 0: - klass = FrameRowApply + klass = FrameRowApply # type: Type[FrameApply] elif axis == 1: klass = FrameColumnApply @@ -43,8 +48,38 @@ def frame_apply( ) -class FrameApply: - def __init__(self, obj, func, raw, result_type, ignore_failures, args, kwds): +class FrameApply(metaclass=abc.ABCMeta): + + # --------------------------------------------------------------- + # Abstract Methods + axis: int + + @property + @abc.abstractmethod + def result_index(self) -> "Index": + pass + + @property + @abc.abstractmethod + def result_columns(self) -> "Index": + pass + + @abc.abstractmethod + def series_generator(self) -> Iterator["Series"]: + pass + + # --------------------------------------------------------------- + + def __init__( + self, + obj: "DataFrame", + func, + raw: bool, + result_type, + ignore_failures: bool, + args, + kwds, + ): self.obj = obj self.raw = raw self.ignore_failures = ignore_failures @@ -76,11 +111,11 @@ def f(x): self.res_columns = None @property - def columns(self): + def columns(self) -> "Index": return self.obj.columns @property - def index(self): + def index(self) -> "Index": return self.obj.index @cache_readonly @@ -88,11 +123,11 @@ def values(self): return self.obj.values @cache_readonly - def dtypes(self): + def dtypes(self) -> "Series": return self.obj.dtypes @property - def agg_axis(self): + def agg_axis(self) -> "Index": return self.obj._get_agg_axis(self.axis) def get_result(self): @@ -127,7 +162,7 @@ def get_result(self): # broadcasting if self.result_type == "broadcast": - return self.apply_broadcast() + return self.apply_broadcast(self.obj) # one axis empty elif not all(self.obj.shape): @@ -191,7 +226,7 @@ def apply_raw(self): else: return self.obj._constructor_sliced(result, index=self.agg_axis) - def apply_broadcast(self, target): + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": result_values = np.empty_like(target.values) # axis which we want to compare compliance @@ -317,19 +352,19 @@ def wrap_results(self): class FrameRowApply(FrameApply): axis = 0 - def apply_broadcast(self): - return super().apply_broadcast(self.obj) + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + return super().apply_broadcast(target) @property def series_generator(self): return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) @property - def result_index(self): + def result_index(self) -> "Index": return self.columns @property - def result_columns(self): + def result_columns(self) -> "Index": return self.index def wrap_results_for_axis(self): @@ -351,8 +386,8 @@ 
def wrap_results_for_axis(self): class FrameColumnApply(FrameApply): axis = 1 - def apply_broadcast(self): - result = super().apply_broadcast(self.obj.T) + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + result = super().apply_broadcast(target.T) return result.T @property @@ -364,11 +399,11 @@ def series_generator(self): ) @property - def result_index(self): + def result_index(self) -> "Index": return self.index @property - def result_columns(self): + def result_columns(self) -> "Index": return self.columns def wrap_results_for_axis(self): @@ -392,7 +427,7 @@ def wrap_results_for_axis(self): return result - def infer_to_same_shape(self): + def infer_to_same_shape(self) -> "DataFrame": """ infer the results to the same shape as the input object """ results = self.results From 4b3027f79a2d598c485156827ee965c5efcd279d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 Nov 2019 16:39:18 -0800 Subject: [PATCH 025/185] CLN: type annotations in groupby.grouper, groupby.ops (#29456) * Annotate groupby.ops * annotations, needs debugging * whitespace * types * circular import * fix msot mypy complaints * fix mypy groupings * merge cleanup --- pandas/core/groupby/grouper.py | 23 ++++++++------- pandas/core/groupby/ops.py | 53 +++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 34 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 370abe75e1327..e6e3ee62459ca 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -119,7 +119,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): def ax(self): return self.grouper - def _get_grouper(self, obj, validate=True): + def _get_grouper(self, obj, validate: bool = True): """ Parameters ---------- @@ -143,17 +143,18 @@ def _get_grouper(self, obj, validate=True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj, sort=False): + def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper for this particular specification Parameters ---------- - obj : the subject object + obj : Series or DataFrame sort : bool, default False whether the resulting grouper should be sorted """ + assert obj is not None if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") @@ -211,13 +212,13 @@ def groups(self): def __repr__(self) -> str: attrs_list = ( - "{}={!r}".format(attr_name, getattr(self, attr_name)) + "{name}={val!r}".format(name=attr_name, val=getattr(self, attr_name)) for attr_name in self._attributes if getattr(self, attr_name) is not None ) attrs = ", ".join(attrs_list) cls_name = self.__class__.__name__ - return "{}({})".format(cls_name, attrs) + return "{cls}({attrs})".format(cls=cls_name, attrs=attrs) class Grouping: @@ -372,7 +373,7 @@ def __init__( self.grouper = self.grouper.astype("timedelta64[ns]") def __repr__(self) -> str: - return "Grouping({0})".format(self.name) + return "Grouping({name})".format(name=self.name) def __iter__(self): return iter(self.indices) @@ -433,10 +434,10 @@ def get_grouper( key=None, axis: int = 0, level=None, - sort=True, - observed=False, - mutated=False, - validate=True, + sort: bool = True, + observed: bool = False, + mutated: bool = False, + validate: bool = True, ) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal @@ -670,7 +671,7 @@ def is_in_obj(gpr) -> bool: return 
grouper, exclusions, obj -def _is_label_like(val): +def _is_label_like(val) -> bool: return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ae397277de41c..72dcd1f7224cc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,6 +36,7 @@ ) from pandas.core.dtypes.missing import _maybe_fill, isna +from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -89,12 +90,16 @@ def __init__( self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis - self.groupings = groupings # type: Sequence[grouper.Grouping] + self._groupings = list(groupings) # type: List[grouper.Grouping] self.sort = sort self.group_keys = group_keys self.mutated = mutated self.indexer = indexer + @property + def groupings(self) -> List["grouper.Grouping"]: + return self._groupings + @property def shape(self): return tuple(ping.ngroups for ping in self.groupings) @@ -106,7 +111,7 @@ def __iter__(self): def nkeys(self) -> int: return len(self.groupings) - def get_iterator(self, data, axis=0): + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -120,7 +125,7 @@ def get_iterator(self, data, axis=0): for key, (i, group) in zip(keys, splitter): yield key, group - def _get_splitter(self, data, axis=0): + def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -142,13 +147,13 @@ def _get_group_keys(self): # provide "flattened" iterator for multi-group setting return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) - def apply(self, f, data, axis: int = 0): + def apply(self, f, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() result_values = None - sdata = splitter._get_sorted_data() + sdata = splitter._get_sorted_data() # type: FrameOrSeries if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 # if we pass EA instead of ndarray @@ -157,7 +162,7 @@ def apply(self, f, data, axis: int = 0): elif ( com.get_callable_name(f) not in base.plotting_methods - and hasattr(splitter, "fast_apply") + and isinstance(splitter, FrameSplitter) and axis == 0 # with MultiIndex, apply_frame_axis0 would raise InvalidApply # TODO: can we make this check prettier? @@ -229,8 +234,7 @@ def names(self): def size(self) -> Series: """ - Compute group sizes - + Compute group sizes. 
""" ids, _, ngroup = self.group_info ids = ensure_platform_int(ids) @@ -292,7 +296,7 @@ def reconstructed_codes(self) -> List[np.ndarray]: return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True) @cache_readonly - def result_index(self): + def result_index(self) -> Index: if not self.compressed and len(self.groupings) == 1: return self.groupings[0].result_index.rename(self.names[0]) @@ -629,7 +633,7 @@ def agg_series(self, obj: Series, func): raise return self._aggregate_series_pure_python(obj, func) - def _aggregate_series_fast(self, obj, func): + def _aggregate_series_fast(self, obj: Series, func): # At this point we have already checked that # - obj.index is not a MultiIndex # - obj is backed by an ndarray, not ExtensionArray @@ -648,7 +652,7 @@ def _aggregate_series_fast(self, obj, func): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python(self, obj, func): + def _aggregate_series_pure_python(self, obj: Series, func): group_index, _, ngroups = self.group_info @@ -705,7 +709,12 @@ class BinGrouper(BaseGrouper): """ def __init__( - self, bins, binlabels, filter_empty=False, mutated=False, indexer=None + self, + bins, + binlabels, + filter_empty: bool = False, + mutated: bool = False, + indexer=None, ): self.bins = ensure_int64(bins) self.binlabels = ensure_index(binlabels) @@ -739,7 +748,7 @@ def _get_grouper(self): """ return self - def get_iterator(self, data: NDFrame, axis: int = 0): + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -811,11 +820,9 @@ def names(self): return [self.binlabels.name] @property - def groupings(self): - from pandas.core.groupby.grouper import Grouping - + def groupings(self) -> "List[grouper.Grouping]": return [ - Grouping(lvl, lvl, in_axis=False, level=None, name=name) + grouper.Grouping(lvl, lvl, in_axis=False, level=None, name=name) for lvl, name in zip(self.levels, self.names) ] @@ -856,7 +863,7 @@ def _is_indexed_like(obj, axes) -> bool: class DataSplitter: - def __init__(self, data, labels, ngroups, axis: int = 0): + def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): self.data = data self.labels = ensure_int64(labels) self.ngroups = ngroups @@ -887,15 +894,15 @@ def __iter__(self): for i, (start, end) in enumerate(zip(starts, ends)): yield i, self._chop(sdata, slice(start, end)) - def _get_sorted_data(self): + def _get_sorted_data(self) -> FrameOrSeries: return self.data.take(self.sort_idx, axis=self.axis) - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) class SeriesSplitter(DataSplitter): - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata: Series, slice_obj: slice) -> Series: return sdata._get_values(slice_obj) @@ -907,14 +914,14 @@ def fast_apply(self, f, names): sdata = self._get_sorted_data() return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: if self.axis == 0: return sdata.iloc[slice_obj] else: return sdata._slice(slice_obj, axis=1) -def get_splitter(data: NDFrame, *args, **kwargs): +def get_splitter(data: FrameOrSeries, *args, **kwargs) -> DataSplitter: if isinstance(data, Series): klass = SeriesSplitter # type: Type[DataSplitter] else: From 0cbf3998b300e1480223717b199ebb352f092450 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Wed, 13 Nov 2019 01:27:53 +0000 Subject: [PATCH 026/185] TST: Add 
tests for single level indexing with loc(axis=1) (#29519) --- .../tests/indexing/multiindex/test_slice.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index f1f11285696f9..f279b5517c3f6 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -492,6 +492,44 @@ def test_loc_axis_arguments(self): with pytest.raises(ValueError): df.loc(axis="foo")[:, :, ["C1", "C3"]] + def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): + + # GH29519 + df = pd.DataFrame( + np.arange(27).reshape(3, 9), + columns=pd.MultiIndex.from_product( + [["a1", "a2", "a3"], ["b1", "b2", "b3"]] + ), + ) + result = df.loc(axis=1)["a1":"a2"] + expected = df.iloc[:, :-3] + + tm.assert_frame_equal(result, expected) + + def test_loc_axis_single_level_single_col_indexing_multiindex_col_df(self): + + # GH29519 + df = pd.DataFrame( + np.arange(27).reshape(3, 9), + columns=pd.MultiIndex.from_product( + [["a1", "a2", "a3"], ["b1", "b2", "b3"]] + ), + ) + result = df.loc(axis=1)["a1"] + expected = df.iloc[:, :3] + expected.columns = ["b1", "b2", "b3"] + + tm.assert_frame_equal(result, expected) + + def test_loc_ax_single_level_indexer_simple_df(self): + + # GH29519 + # test single level indexing on single index column data frame + df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=["a", "b", "c"]) + result = df.loc(axis=1)["a"] + expected = pd.Series(np.array([0, 3, 6]), name="a") + tm.assert_series_equal(result, expected) + def test_per_axis_per_level_setitem(self): # test index maker From a562c222333d265bc921122a5561d356aadc7a70 Mon Sep 17 00:00:00 2001 From: AyowoleT Date: Wed, 13 Nov 2019 02:28:53 +0100 Subject: [PATCH 027/185] CI: Check for whitespaces before class (#28489) --- ci/code_checks.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 07c99b39e83e8..ab7bd7895a596 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -190,6 +190,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.rst" ".. ipython ::" doc/source RET=$(($RET + $?)) ; echo $MSG "DONE" +    MSG='Check for extra blank lines after the class definition' ; echo $MSG +    invgrep -R --include="*.py" --include="*.pyx" -E 'class.*:\n\n( )+"""' . 
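+    # i.e. a class statement, then a blank line, then an indented docstring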
+    RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG set -o pipefail if [[ "$AZURE" == "true" ]]; then From f1cb42cfb8aace1fea8c731489c5478d0bf21b90 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 Nov 2019 17:29:21 -0800 Subject: [PATCH 028/185] REF: pre-allocate results in libreduction (#29550) --- pandas/_libs/reduction.pyx | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index fa9c12777eb5b..a150138f904ef 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -18,15 +18,13 @@ cimport pandas._libs.util as util from pandas._libs.lib import maybe_convert_objects -cdef _get_result_array(object obj, Py_ssize_t size, Py_ssize_t cnt): +cdef _check_result_array(object obj, Py_ssize_t cnt): if (util.is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == (cnt,)): raise ValueError('Function does not reduce') - return np.empty(size, dtype='O') - cdef bint _is_sparse_array(object obj): # TODO can be removed one SparseArray.values is removed (GH26421) @@ -116,6 +114,9 @@ cdef class Reducer: has_index = self.index is not None incr = self.increment + result = np.empty(self.nresults, dtype='O') + it = PyArray_IterNew(result) + try: for i in range(self.nresults): @@ -158,10 +159,9 @@ cdef class Reducer: and util.is_array(res.values)): res = res.values if i == 0: - result = _get_result_array(res, - self.nresults, - len(self.dummy)) - it = PyArray_IterNew(result) + # On the first pass, we check the output shape to see + # if this looks like a reduction. + _check_result_array(res, len(self.dummy)) PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) chunk.data = chunk.data + self.increment @@ -170,9 +170,7 @@ cdef class Reducer: # so we don't free the wrong memory chunk.data = dummy_buf - if result.dtype == np.object_: - result = maybe_convert_objects(result) - + result = maybe_convert_objects(result) return result @@ -275,6 +273,8 @@ cdef class SeriesBinGrouper(_BaseGrouper): vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) + result = np.empty(self.ngroups, dtype='O') + try: for i in range(self.ngroups): group_size = counts[i] @@ -289,10 +289,11 @@ cdef class SeriesBinGrouper(_BaseGrouper): res = self.f(cached_typ) res = _extract_result(res) if not initialized: + # On the first pass, we check the output shape to see + # if this looks like a reduction. initialized = 1 - result = _get_result_array(res, - self.ngroups, - len(self.dummy_arr)) + _check_result_array(res, len(self.dummy_arr)) + result[i] = res islider.advance(group_size) @@ -303,9 +304,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): islider.reset() vslider.reset() - if result.dtype == np.object_: - result = maybe_convert_objects(result) - + result = maybe_convert_objects(result) return result, counts @@ -368,6 +367,8 @@ cdef class SeriesGrouper(_BaseGrouper): vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) + result = np.empty(self.ngroups, dtype='O') + try: for i in range(n): group_size += 1 @@ -391,10 +392,10 @@ cdef class SeriesGrouper(_BaseGrouper): res = self.f(cached_typ) res = _extract_result(res) if not initialized: + # On the first pass, we check the output shape to see + # if this looks like a reduction. 
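+                # (_check_result_array raises ValueError('Function does not
+                # reduce') when res is an array-like of the group's length)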
initialized = 1 - result = _get_result_array(res, - self.ngroups, - len(self.dummy_arr)) + _check_result_array(res, len(self.dummy_arr)) result[lab] = res counts[lab] = group_size @@ -410,10 +411,9 @@ cdef class SeriesGrouper(_BaseGrouper): # We check for empty series in the constructor, so should always # have result initialized by this point. - assert result is not None, "`result` has not been assigned." + assert initialized, "`result` has not been initialized." - if result.dtype == np.object_: - result = maybe_convert_objects(result) + result = maybe_convert_objects(result) return result, counts From e5c3907bd68b2817660f6e3bf3b3386ff640d1cf Mon Sep 17 00:00:00 2001 From: ganevgv Date: Wed, 13 Nov 2019 01:48:01 +0000 Subject: [PATCH 029/185] TST: add test for .loc indexing Index type preservation (#29533) --- pandas/tests/frame/test_indexing.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index e215c90d2eb04..61bf91d3018eb 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -3856,3 +3856,28 @@ def test_functions_no_warnings(self): df["group"] = pd.cut( df.value, range(0, 105, 10), right=False, labels=labels ) + + def test_loc_indexing_preserves_index_category_dtype(self): + # GH 15166 + df = DataFrame( + data=np.arange(2, 22, 2), + index=pd.MultiIndex( + levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"], + ), + ) + + expected = pd.CategoricalIndex( + ["a", "b"], + categories=["a", "b"], + ordered=False, + name="Index1", + dtype="category", + ) + + result = df.index.levels[0] + tm.assert_index_equal(result, expected) + + result = df.loc[["a"]].index.levels[0] + tm.assert_index_equal(result, expected) From 719ab02fddb0fa49512c6d5ff6f0707067aba032 Mon Sep 17 00:00:00 2001 From: Louis Huynh <12685195+louishuynh@users.noreply.github.com> Date: Wed, 13 Nov 2019 01:57:18 +0000 Subject: [PATCH 030/185] TST: Test type issue fix in empty groupby from DataFrame with categorical (#29355) --- pandas/tests/groupby/test_categorical.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 22a23407b2521..a187781ea214c 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -781,6 +781,22 @@ def test_categorical_no_compress(): tm.assert_numpy_array_equal(result, exp) +def test_groupby_empty_with_category(): + # GH-9614 + # test fix for when group by on None resulted in + # coercion of dtype categorical -> float + df = pd.DataFrame( + {"A": [None] * 3, "B": pd.Categorical(["train", "train", "test"])} + ) + result = df.groupby("A").first()["B"] + expected = pd.Series( + pd.Categorical([], categories=["test", "train"]), + index=pd.Series([], dtype="object", name="A"), + name="B", + ) + tm.assert_series_equal(result, expected) + + def test_sort(): # http://stackoverflow.com/questions/23814368/sorting-pandas- From 6b62e50435a81dbfa692660b72e9929d2cb65146 Mon Sep 17 00:00:00 2001 From: Jiang Yue <35633013+jiangyue12392@users.noreply.github.com> Date: Wed, 13 Nov 2019 10:06:32 +0800 Subject: [PATCH 031/185] ENH: Use IntergerArray to avoid forced conversion from integer to float (#27335) --- pandas/_libs/lib.pyx | 53 ++++++++++++++++++++++++--- pandas/core/frame.py | 6 +-- pandas/core/internals/construction.py | 1 + 
 pandas/tests/dtypes/test_inference.py | 15 ++++++++
 4 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 7d65cb52bce1e..c1fd46f4bba9e 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -971,6 +971,7 @@ cdef class Seen:
         bint nat_          # seen nat
         bint bool_         # seen_bool
         bint null_         # seen_null
+        bint nan_          # seen_np.nan
         bint uint_         # seen_uint (unsigned integer)
         bint sint_         # seen_sint (signed integer)
         bint float_        # seen_float
@@ -995,6 +996,7 @@ cdef class Seen:
         self.nat_ = 0
         self.bool_ = 0
         self.null_ = 0
+        self.nan_ = 0
         self.uint_ = 0
         self.sint_ = 0
         self.float_ = 0
@@ -1953,10 +1955,37 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
 @cython.wraparound(False)
 def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
                           bint safe=0, bint convert_datetime=0,
-                          bint convert_timedelta=0):
+                          bint convert_timedelta=0,
+                          bint convert_to_nullable_integer=0):
     """
     Type inference function-- convert object array to proper dtype
+
+    Parameters
+    ----------
+    objects : ndarray
+        Array of object elements to convert.
+    try_float : bool, default False
+        If an array-like object containing only float or NaN values is
+        encountered, whether to convert and return an array of float dtype.
+    safe : bool, default False
+        Whether to upcast numeric type (e.g. int cast to float). If set to
+        True, no upcasting will be performed.
+    convert_datetime : bool, default False
+        If an array-like object containing only datetime values or NaT is
+        encountered, whether to convert and return an array of M8[ns] dtype.
+    convert_timedelta : bool, default False
+        If an array-like object containing only timedelta values or NaT is
+        encountered, whether to convert and return an array of m8[ns] dtype.
+    convert_to_nullable_integer : bool, default False
+        If an array-like object containing only integer values (and NaN) is
+        encountered, whether to convert and return an IntegerArray.
+
+    Returns
+    -------
+    array : array of converted object values to more specific dtypes if
+        applicable
     """

+
     cdef:
         Py_ssize_t i, n
         ndarray[float64_t] floats
@@ -1977,6 +2006,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     ints = np.empty(n, dtype='i8')
     uints = np.empty(n, dtype='u8')
     bools = np.empty(n, dtype=np.uint8)
+    mask = np.full(n, False)

     if convert_datetime:
         datetimes = np.empty(n, dtype='M8[ns]')
@@ -1994,6 +2024,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
         if val is None:
             seen.null_ = 1
             floats[i] = complexes[i] = fnan
+            mask[i] = True
         elif val is NaT:
             seen.nat_ = 1
             if convert_datetime:
@@ -2003,6 +2034,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
             if not (convert_datetime or convert_timedelta):
                 seen.object_ = 1
                 break
+        elif val is np.nan:
+            seen.nan_ = 1
+            mask[i] = True
+            floats[i] = complexes[i] = val
         elif util.is_bool_object(val):
             seen.bool_ = 1
             bools[i] = val
@@ -2084,11 +2119,19 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,

     if not seen.object_:
         if not safe:
-            if seen.null_:
+            if seen.null_ or seen.nan_:
                 if seen.is_float_or_complex:
                     if seen.complex_:
                         return complexes
-                    elif seen.float_ or seen.int_:
+                    elif seen.float_:
+                        return floats
+                    elif seen.int_:
+                        if convert_to_nullable_integer:
+                            from pandas.core.arrays import IntegerArray
+                            return IntegerArray(ints, mask)
+                        else:
+                            return floats
+                    elif seen.nan_:
                         return floats
                 else:
                     if not seen.bool_:
@@ -2127,7 +2170,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
             if seen.complex_:
                 if not seen.int_:
                     return complexes
-            elif seen.float_:
+            elif seen.float_ or seen.nan_:
                 if not seen.int_:
                     return floats
             else:
@@ -2151,7 +2194,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
             if seen.complex_:
                 if not seen.int_:
                     return complexes
-            elif seen.float_:
+            elif seen.float_ or seen.nan_:
                 if not seen.int_:
                     return floats
             elif seen.int_:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ebee8b10896be..39e325a7e047b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -857,9 +857,9 @@ def style(self):
         ...                   index=['panda', 'polar', 'koala'])
         >>> df
                species population
-        panda bear 1864
-        polar bear 22000
-        koala marsupial 80000
+        panda bear 1864
+        polar bear 22000
+        koala marsupial 80000
         >>> for label, content in df.items():
         ...     print('label:', label)
         ...     
print('content:', content, sep='\n') diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 05a2803b3fc2f..bb62db431ac73 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -548,6 +548,7 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): tuple arrays, columns """ + if columns is None: gen = (list(x.keys()) for x in data) types = (dict, OrderedDict) if PY36 else OrderedDict diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 62fb118f719e3..0408c78ac1536 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -51,6 +51,7 @@ Timestamp, isna, ) +from pandas.core.arrays import IntegerArray import pandas.util.testing as tm @@ -552,6 +553,20 @@ def test_maybe_convert_objects_datetime(self): out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) tm.assert_numpy_array_equal(out, exp) + @pytest.mark.parametrize( + "exp", + [ + IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])), + IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])), + ], + ) + def test_maybe_convert_objects_nullable_integer(self, exp): + # GH27335 + arr = np.array([2, np.NaN], dtype=object) + result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=1) + + tm.assert_extension_array_equal(result, exp) + def test_mixed_dtypes_remain_object_array(self): # GH14956 array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) From cd04be2b25a54bc98d9c2743747b1ff143d52fc9 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Wed, 13 Nov 2019 03:13:05 +0100 Subject: [PATCH 032/185] TST: Add tests for MultiIndex columns cases in aggregate relabelling (#29504) --- .../tests/groupby/aggregate/test_aggregate.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index c03ffe317083c..4313b52798c6e 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -495,6 +495,80 @@ def test_mangled(self): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3", + [ + ( + (("y", "A"), "max"), + (("y", "A"), np.min), + (("y", "B"), "mean"), + [1, 3], + [0, 2], + [5.5, 7.5], + ), + ( + (("y", "A"), lambda x: max(x)), + (("y", "A"), lambda x: 1), + (("y", "B"), "mean"), + [1, 3], + [1, 1], + [5.5, 7.5], + ), + ( + pd.NamedAgg(("y", "A"), "max"), + pd.NamedAgg(("y", "B"), np.mean), + pd.NamedAgg(("y", "A"), lambda x: 1), + [1, 3], + [5.5, 7.5], + [1, 1], + ), + ], +) +def test_agg_relabel_multiindex_column( + agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3 +): + # GH 29422, add tests for multiindex column cases + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + idx = pd.Index(["a", "b"], name=("x", "group")) + + result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max")) + expected = DataFrame({"a_max": [1, 3]}, index=idx) + tm.assert_frame_equal(result, expected) + + result = df.groupby(("x", "group")).agg( + col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 + ) + expected = DataFrame( + {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx + ) + 
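+    # expected mirrors the relabeled columns (col_1..col_3) over the ("x", "group") index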
tm.assert_frame_equal(result, expected)
+
+
+def test_agg_relabel_multiindex_raises_not_exist():
+    # GH 29422, add test for the raises scenario when the aggregate column does not exist
+    df = DataFrame(
+        {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
+    )
+    df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
+
+    with pytest.raises(KeyError, match="does not exist"):
+        df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
+
+
+def test_agg_relabel_multiindex_raises_duplicate():
+    # GH 29422, add test for the raises scenario when getting duplicates
+    df = DataFrame(
+        {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
+    )
+    df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
+
+    with pytest.raises(SpecificationError, match="Function names"):
+        df.groupby(("x", "group")).agg(a=(("y", "A"), "min"), b=(("y", "A"), "min"))
+
+
 def myfunc(s):
     return np.percentile(s, q=0.90)

From 0e1d56a63af752d78ef89b50ffd9c1acf9a351d8 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 12 Nov 2019 19:35:34 -0800
Subject: [PATCH 033/185] Update MultiIndex checks (#29494)

---
 pandas/_libs/reduction.pyx |  6 +++---
 pandas/core/groupby/ops.py | 21 ++++++++++-----------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index a150138f904ef..11dc2d04bb74e 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -493,8 +493,8 @@ def apply_frame_axis0(object frame, object f, object names,
         object piece
         dict item_cache

-    if frame.index._has_complex_internals:
-        raise InvalidApply('Cannot modify frame index internals')
+    # We have already checked that we don't have a MultiIndex before calling
+    assert frame.index.nlevels == 1

     results = []

@@ -625,7 +625,7 @@ def compute_reduction(arr, f, axis=0, dummy=None, labels=None):
     if labels is not None:
         # Caller is responsible for ensuring we don't have MultiIndex
-        assert not labels._has_complex_internals
+        assert labels.nlevels == 1

         # pass as an ndarray/ExtensionArray
         labels = labels._values
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 72dcd1f7224cc..390fe60ea02b4 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -164,27 +164,26 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0):
             com.get_callable_name(f) not in base.plotting_methods
             and isinstance(splitter, FrameSplitter)
             and axis == 0
-            # with MultiIndex, apply_frame_axis0 would raise InvalidApply
-            # TODO: can we make this check prettier?
-            and not sdata.index._has_complex_internals
+            # apply_frame_axis0 doesn't allow MultiIndex
+            and not isinstance(sdata.index, MultiIndex)
         ):
             try:
                 result_values, mutated = splitter.fast_apply(f, group_keys)
-                # If the fast apply path could be used we can return here.
-                # Otherwise we need to fall back to the slow implementation.
-                if len(result_values) == len(group_keys):
-                    return group_keys, result_values, mutated
-
             except libreduction.InvalidApply as err:
-                # Cannot fast apply on MultiIndex (_has_complex_internals).
-                # This Exception is also raised if `f` triggers an exception
+                # This Exception is raised if `f` triggers an exception
                 # but it is preferable to raise the exception in Python.
                 if "Let this error raise above us" not in str(err):
                     # TODO: can we infer anything about whether this is
                     # worth-retrying in pure-python?
                     raise
+            else:
+                # If the fast apply path could be used we can return here.
+                # Otherwise we need to fall back to the slow implementation.
+ if len(result_values) == len(group_keys): + return group_keys, result_values, mutated + for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, "name", key) @@ -619,7 +618,7 @@ def agg_series(self, obj: Series, func): # TODO: is the datetime64tz case supposed to go through here? return self._aggregate_series_pure_python(obj, func) - elif obj.index._has_complex_internals: + elif isinstance(obj.index, MultiIndex): # MultiIndex; Pre-empt TypeError in _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) From 2ec7f2f279d770b286c9c7679ba7ad0e2f14dcbe Mon Sep 17 00:00:00 2001 From: Brian Wignall Date: Tue, 12 Nov 2019 22:36:45 -0500 Subject: [PATCH 034/185] Fix small typos (#29584) --- doc/source/user_guide/integer_na.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 97b9c2f95dc50..db6869a3804bd 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -63,7 +63,7 @@ up with a ``float64`` dtype Series: pd.Series([1, 2, np.nan]) Operations involving an integer array will behave similar to NumPy arrays. -Missing values will be propagated, and and the data will be coerced to another +Missing values will be propagated, and the data will be coerced to another dtype if needed. .. ipython:: python diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index c7278d5a47ba6..e7dc6150ffcb1 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -33,7 +33,7 @@ Check the :ref:`API Changes ` and :ref:`deprecations .. note:: - This is a combined release for 0.20.0 and and 0.20.1. + This is a combined release for 0.20.0 and 0.20.1. Version 0.20.1 contains one additional change for backwards-compatibility with downstream projects using pandas' ``utils`` routines. (:issue:`16250`) .. 
contents:: What's new in v0.20.0 From ded1d6e2d326b0d9b953fff70d544098b888cb97 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Wed, 13 Nov 2019 14:28:33 +0200 Subject: [PATCH 035/185] Removed the commented code (#29592) --- pandas/_libs/parsers.pyx | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a2c7d0da5b4a8..bbea66542a953 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -843,11 +843,6 @@ cdef class TextReader: passed_count = len(header[0]) - # if passed_count > field_count: - # raise ParserError('Column names have %d fields, ' - # 'data has %d fields' - # % (passed_count, field_count)) - if (self.has_usecols and self.allow_leading_cols and not callable(self.usecols)): nuse = len(self.usecols) From 0e6efcf807b63e0774b8af78cd96808fca23c186 Mon Sep 17 00:00:00 2001 From: Gina Date: Wed, 13 Nov 2019 10:44:18 -0600 Subject: [PATCH 036/185] Update FUNDING.yml (#29601) add NumFOCUS github sponsors button --- .github/FUNDING.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 944ce9b4fb1f6..27dfded808b95 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,2 +1,3 @@ custom: https://pandas.pydata.org/donate.html +github: [numfocus] tidelift: pypi/pandas From 2064876e271358f178ad806b3a97d6bc0b37e5ca Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 13 Nov 2019 09:46:57 -0700 Subject: [PATCH 037/185] BUG: Fix DatetimeIndex.strftime with NaT present (#29583) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/arrays/datetimelike.py | 3 ++- pandas/tests/arrays/test_datetimelike.py | 20 ++++++++++++++++++-- pandas/tests/series/test_datetime_values.py | 16 +++++++++++++++- 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 93f59f6a6a614..90606fb61ada8 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -314,7 +314,7 @@ Datetimelike - Bug in :func:`pandas.core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) -- +- Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 497f33f0f4704..287ff9d618501 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -179,7 +179,8 @@ def strftime(self, date_format): 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - return self._format_native_types(date_format=date_format).astype(object) + result = self._format_native_types(date_format=date_format, na_rep=np.nan) + return result.astype(object) class TimelikeOps: diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 117a19acbfc3a..3bacd560e75cf 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -473,7 +473,15 @@ def 
test_strftime(self, datetime_index): arr = DatetimeArray(datetime_index) result = arr.strftime("%Y %b") - expected = np.array(datetime_index.strftime("%Y %b")) + expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_nat(self): + # GH 29578 + arr = DatetimeArray(DatetimeIndex(["2019-01-01", pd.NaT])) + + result = arr.strftime("%Y-%m-%d") + expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -679,7 +687,15 @@ def test_strftime(self, period_index): arr = PeriodArray(period_index) result = arr.strftime("%Y") - expected = np.array(period_index.strftime("%Y")) + expected = np.array([per.strftime("%Y") for per in arr], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_nat(self): + # GH 29578 + arr = PeriodArray(PeriodIndex(["2019-01-01", pd.NaT], dtype="period[D]")) + + result = arr.strftime("%Y-%m-%d") + expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 9304e1c4fc157..0be3c729cff91 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -504,7 +504,7 @@ def test_strftime(self): s.iloc[0] = pd.NaT result = s.dt.strftime("%Y/%m/%d") expected = Series( - ["NaT", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + [np.nan, "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) tm.assert_series_equal(result, expected) @@ -554,6 +554,20 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data", + [ + DatetimeIndex(["2019-01-01", pd.NaT]), + PeriodIndex(["2019-01-01", pd.NaT], dtype="period[D]"), + ], + ) + def test_strftime_nat(self, data): + # GH 29578 + s = Series(data) + result = s.dt.strftime("%Y-%m-%d") + expected = Series(["2019-01-01", np.nan]) + tm.assert_series_equal(result, expected) + def test_valid_dt_with_missing_values(self): from datetime import date, time From b68899f7ed55a0f7260cb265629d30437c05fbb1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 13 Nov 2019 08:47:28 -0800 Subject: [PATCH 038/185] TYP: annotate _data (#29580) --- pandas/core/indexes/base.py | 4 +--- pandas/core/indexes/datetimelike.py | 6 +++--- pandas/core/indexes/period.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9748342c86f31..68736935ed36d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -230,7 +230,7 @@ def _outer_indexer(self, left, right): return libjoin.outer_join_indexer(left, right) _typ = "index" - _data = None + _data: Union[ExtensionArray, np.ndarray] _id = None name = None _comparables = ["name"] @@ -653,8 +653,6 @@ def __len__(self) -> int: """ Return the length of the Index. 
""" - # Assertion needed for mypy, see GH#29475 - assert self._data is not None return len(self._data) def __array__(self, dtype=None): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index bf89bbbdf2b79..f694b85f1ca5d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -27,7 +27,7 @@ from pandas.core import algorithms, ops from pandas.core.accessor import PandasDelegate -from pandas.core.arrays import ExtensionOpsMixin +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.arrays.datetimelike import ( DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8, @@ -78,7 +78,7 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): common ops mixin to support a unified interface datetimelike Index """ - _data = None + _data: ExtensionArray # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. They can be made into cache_readonly for Index @@ -836,7 +836,7 @@ class DatetimelikeDelegateMixin(PandasDelegate): # raw_properties : dispatch properties that shouldn't be boxed in an Index _raw_properties = set() # type: Set[str] name = None - _data = None + _data: ExtensionArray @property def _delegate_class(self): diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 3bcb9ba345713..2b70049fd14af 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -174,7 +174,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): _is_numeric_dtype = False _infer_as_myclass = True - _data = None + _data: PeriodArray _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True From 33181d92a97a57e2ac0bf5efee9d6c301be430af Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 13 Nov 2019 16:58:07 +0000 Subject: [PATCH 039/185] Make subdirs in tests/io/data (#29513) --- pandas/plotting/_misc.py | 2 +- pandas/tests/io/data/{ => csv}/banklist.csv | 0 pandas/tests/io/data/{ => csv}/iris.csv | 0 pandas/tests/io/data/{ => csv}/test1.csv | 0 pandas/tests/io/data/{ => csv}/test_mmap.csv | 0 pandas/tests/io/data/{ => csv}/tips.csv | 0 pandas/tests/io/data/{ => excel}/blank.ods | Bin pandas/tests/io/data/{ => excel}/blank.xls | Bin pandas/tests/io/data/{ => excel}/blank.xlsm | Bin pandas/tests/io/data/{ => excel}/blank.xlsx | Bin .../io/data/{ => excel}/blank_with_header.ods | Bin .../io/data/{ => excel}/blank_with_header.xls | Bin .../data/{ => excel}/blank_with_header.xlsm | Bin .../data/{ => excel}/blank_with_header.xlsx | Bin .../data/{ => excel}/invalid_value_type.ods | Bin pandas/tests/io/data/{ => excel}/test1.ods | Bin pandas/tests/io/data/{ => excel}/test1.xls | Bin pandas/tests/io/data/{ => excel}/test1.xlsm | Bin pandas/tests/io/data/{ => excel}/test1.xlsx | Bin pandas/tests/io/data/{ => excel}/test2.ods | Bin pandas/tests/io/data/{ => excel}/test2.xls | Bin pandas/tests/io/data/{ => excel}/test2.xlsm | Bin pandas/tests/io/data/{ => excel}/test2.xlsx | Bin pandas/tests/io/data/{ => excel}/test3.ods | Bin pandas/tests/io/data/{ => excel}/test3.xls | Bin pandas/tests/io/data/{ => excel}/test3.xlsm | Bin pandas/tests/io/data/{ => excel}/test3.xlsx | Bin pandas/tests/io/data/{ => excel}/test4.ods | Bin pandas/tests/io/data/{ => excel}/test4.xls | Bin pandas/tests/io/data/{ => excel}/test4.xlsm | Bin pandas/tests/io/data/{ => excel}/test4.xlsx | Bin pandas/tests/io/data/{ => excel}/test5.ods | Bin pandas/tests/io/data/{ => excel}/test5.xls | Bin pandas/tests/io/data/{ => excel}/test5.xlsm | Bin 
 [diffstat elided here for roughly 140 binary test-data fixtures: every file
 under pandas/tests/io/data/ moves unchanged (Bin, 100% similarity) into a
 per-format subdirectory -- csv/, excel/, feather/, fixed_width/, html/,
 pickle/, spss/ and stata/]
 pandas/tests/io/excel/conftest.py             |  5 +--
 pandas/tests/io/excel/test_odf.py             |  2 +-
 pandas/tests/io/excel/test_readers.py         | 12 ++++---
 pandas/tests/io/excel/test_xlrd.py            |  2 +-
 pandas/tests/io/test_common.py                | 28 ++++++++++-----
 pandas/tests/io/test_html.py                  | 34 +++++++++---------
 pandas/tests/io/test_pickle.py                | 12 ++++---
 pandas/tests/io/test_spss.py                  | 10 +++---
 pandas/tests/io/test_sql.py                   |  2 +-
 pandas/tests/io/test_stata.py                 |  4 +--
 153 files changed, 66 insertions(+), 47 deletions(-)
 rename pandas/tests/io/data/{ => csv}/banklist.csv (100%)
 rename pandas/tests/io/data/{ => csv}/iris.csv (100%)
 [the remaining `rename pandas/tests/io/data/{ => <format>}/<file> (100%)`
 summary entries are elided; all are pure renames into the directories above]

diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
index 50b3b405692a5..59cbdb0318e5a 100644
--- a/pandas/plotting/_misc.py
+++ b/pandas/plotting/_misc.py
@@ -364,7 +364,7 @@ def parallel_coordinates(
     --------
     >>> from matplotlib import pyplot as plt
     >>> df = pd.read_csv('https://raw.github.com/pandas-dev/pandas/master'
-                        '/pandas/tests/data/iris.csv')
+                        '/pandas/tests/data/csv/iris.csv')
     >>> pd.plotting.parallel_coordinates(
             df, 'Name', color=('#556270', '#4ECDC4', '#C7F464'))

[the matching four-line `diff --git ... similarity index 100% / rename from ...
/ rename to ...` stanzas for these data-file moves are elided; they only
restate the rename summary above]
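The reorganization leans on the test suite's `datapath` fixture, which joins
path segments into an absolute path under pandas/tests; moving each fixture
file into a per-format directory just adds one more segment to those calls.
A minimal sketch of how such a fixture can be written (the `BASE_PATH` root
and the skip-on-missing behavior are illustrative assumptions, not the exact
pandas implementation):

    import os

    import pytest

    # assumed project-relative root for test data
    BASE_PATH = os.path.join(os.path.dirname(__file__), "pandas", "tests")

    @pytest.fixture
    def datapath():
        """Return a callable that maps path segments to a test-data file."""

        def _datapath(*args):
            # e.g. datapath("io", "data", "excel", "test1.xlsx")
            path = os.path.join(BASE_PATH, *args)
            if not os.path.exists(path):
                # skip instead of erroring when the data file is not shipped
                pytest.skip("test data file {} does not exist".format(path))
            return path

        return _datapath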
diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py
index 843b3c08421b3..6ec2f477a442d 100644
--- a/pandas/tests/io/excel/conftest.py
+++ b/pandas/tests/io/excel/conftest.py
@@ -24,11 +24,12 @@ def merge_cells(request):
 @pytest.fixture
-def df_ref():
+def df_ref(datapath):
     """
     Obtain the reference data from read_csv with the Python engine.
     """
-    df_ref = read_csv("test1.csv", index_col=0, parse_dates=True, engine="python")
+    filepath = datapath("io", "data", "csv", "test1.csv")
+    df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python")
     return df_ref

diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py
index 47e610562a388..6e5610f4f5838 100644
--- a/pandas/tests/io/excel/test_odf.py
+++ b/pandas/tests/io/excel/test_odf.py
@@ -13,7 +13,7 @@ def cd_and_set_engine(monkeypatch, datapath):
     func = functools.partial(pd.read_excel, engine="odf")
     monkeypatch.setattr(pd, "read_excel", func)
-    monkeypatch.chdir(datapath("io", "data"))
+    monkeypatch.chdir(datapath("io", "data", "excel"))

 def test_read_invalid_types_raises():

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 1d3653f685e1e..8c3b9a07641f7 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -81,7 +81,7 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext):
             pytest.skip()

         func = partial(pd.read_excel, engine=engine)
-        monkeypatch.chdir(datapath("io", "data"))
+        monkeypatch.chdir(datapath("io", "data", "excel"))
         monkeypatch.setattr(pd, "read_excel", func)

     def test_usecols_int(self, read_ext, df_ref):
@@ -502,9 +502,11 @@ def test_read_from_http_url(self, read_ext):
         if read_ext == ".ods":  # TODO: remove once on master
             pytest.skip()

+        # TODO: alimcmaster1 - revert to master
         url = (
-            "https://raw.github.com/pandas-dev/pandas/master/"
-            "pandas/tests/io/data/test1" + read_ext
+            "https://raw.githubusercontent.com/alimcmaster1"
+            "/pandas/mcmali-tests-dir-struct/"
+            "pandas/tests/io/data/excel/test1" + read_ext
         )
         url_table = pd.read_excel(url)
         local_table = pd.read_excel("test1" + read_ext)
@@ -527,7 +529,7 @@ def test_read_from_s3_url(self, read_ext, s3_resource):

     def test_read_from_file_url(self, read_ext, datapath):
         # FILE
-        localtable = os.path.join(datapath("io", "data"), "test1" + read_ext)
+        localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext)
         local_table = pd.read_excel(localtable)

         try:
@@ -828,7 +830,7 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext):
             pytest.skip()

         func = partial(pd.ExcelFile, engine=engine)
-        monkeypatch.chdir(datapath("io", "data"))
+        monkeypatch.chdir(datapath("io", "data", "excel"))
         monkeypatch.setattr(pd, "ExcelFile", func)

     def test_excel_passes_na(self, read_ext):
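The `cd_and_set_engine` fixtures touched above all follow one pattern: bind
the engine with `functools.partial`, patch the bound function back onto the
`pd` namespace, then change into the data directory so tests can open files
such as "test1.xlsx" by bare name. A condensed sketch of that pattern (the
engine is hard-coded here purely for illustration):

    from functools import partial

    import pandas as pd
    import pytest

    @pytest.fixture
    def cd_and_set_engine(monkeypatch, datapath):
        # bind the engine once so every pd.read_excel call in the test uses it
        func = partial(pd.read_excel, engine="xlrd")
        monkeypatch.setattr(pd, "read_excel", func)
        # resolve bare file names against the per-format data directory
        monkeypatch.chdir(datapath("io", "data", "excel"))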
diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py
index fc36be9e1b738..e04dfc97d4968 100644
--- a/pandas/tests/io/excel/test_xlrd.py
+++ b/pandas/tests/io/excel/test_xlrd.py
@@ -35,7 +35,7 @@ def test_read_xlrd_book(read_ext, frame):

 # TODO: test for openpyxl as well
 def test_excel_table_sheet_by_index(datapath, read_ext):
-    path = datapath("io", "data", "test1{}".format(read_ext))
+    path = datapath("io", "data", "excel", "test1{}".format(read_ext))
     with pd.ExcelFile(path) as excel:
         with pytest.raises(xlrd.XLRDError):
             pd.read_excel(excel, "asdf")

diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 655fd9d01c1c0..2af370a696860 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -208,21 +208,33 @@ def test_read_expands_user_home_dir(
     @pytest.mark.parametrize(
         "reader, module, path",
         [
-            (pd.read_csv, "os", ("io", "data", "iris.csv")),
-            (pd.read_table, "os", ("io", "data", "iris.csv")),
-            (pd.read_fwf, "os", ("io", "data", "fixed_width_format.txt")),
-            (pd.read_excel, "xlrd", ("io", "data", "test1.xlsx")),
-            (pd.read_feather, "feather", ("io", "data", "feather-0_3_1.feather")),
+            (pd.read_csv, "os", ("data", "iris.csv")),
+            (pd.read_table, "os", ("data", "iris.csv")),
+            (
+                pd.read_fwf,
+                "os",
+                ("io", "data", "fixed_width", "fixed_width_format.txt"),
+            ),
+            (pd.read_excel, "xlrd", ("io", "data", "excel", "test1.xlsx")),
+            (
+                pd.read_feather,
+                "feather",
+                ("io", "data", "feather", "feather-0_3_1.feather"),
+            ),
             (
                 pd.read_hdf,
                 "tables",
                 ("io", "data", "legacy_hdf", "datetimetz_object.h5"),
             ),
-            (pd.read_stata, "os", ("io", "data", "stata10_115.dta")),
+            (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")),
             (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")),
             (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")),
             (pd.read_msgpack, "os", ("io", "msgpack", "data", "frame.mp")),
-            (pd.read_pickle, "os", ("io", "data", "categorical.0.25.0.pickle")),
+            (
+                pd.read_pickle,
+                "os",
+                ("io", "data", "pickle", "categorical.0.25.0.pickle"),
+            ),
         ],
     )
     def test_read_fspath_all(self, reader, module, path, datapath):
@@ -296,7 +308,7 @@ def test_write_fspath_hdf5(self):

 @pytest.fixture
 def mmap_file(datapath):
-    return datapath("io", "data", "test_mmap.csv")
+    return datapath("io", "data", "csv", "test_mmap.csv")


 class TestMMapWrapper:

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 3d855a12d5481..1cd6740ab831a 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -63,7 +63,7 @@ def test_bs4_version_fails(monkeypatch, datapath):
     monkeypatch.setattr(bs4, "__version__", "4.2")
     with pytest.raises(ImportError, match="Pandas requires version"):
-        read_html(datapath("io", "data", "spam.html"), flavor="bs4")
+        read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4")

 def test_invalid_flavor():
@@ -78,7 +78,7 @@
 @td.skip_if_no("bs4")
 @td.skip_if_no("lxml")
 def test_same_ordering(datapath):
-    filename = datapath("io", "data", "valid_markup.html")
+    filename = datapath("io", "data", "html", "valid_markup.html")
     dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"])
     dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"])
     assert_framelist_equal(dfs_lxml, dfs_bs4)
@@ -95,10 +95,10 @@
 class TestReadHtml:
     @pytest.fixture(autouse=True)
     def set_files(self, datapath):
-        self.spam_data = datapath("io", "data", "spam.html")
+        self.spam_data = datapath("io", "data", "html", "spam.html")
         self.spam_data_kwargs = {}
         self.spam_data_kwargs["encoding"] = "UTF-8"
-        self.banklist_data = datapath("io", "data", "banklist.html")
+        self.banklist_data = datapath("io", "data", "html", "banklist.html")

     @pytest.fixture(autouse=True, scope="function")
     def set_defaults(self, flavor, request):
@@ -133,9 +133,11 @@ def test_banklist_url(self):

     @tm.network
     def test_spam_url(self):
+        # TODO: alimcmaster1 - revert to master
         url = (
-            "https://raw.githubusercontent.com/pandas-dev/pandas/master/"
-            "pandas/tests/io/data/spam.html"
+            "https://raw.githubusercontent.com/alimcmaster1/"
+            "pandas/mcmali-tests-dir-struct/"
+            "pandas/tests/io/data/html/spam.html"
         )
         df1 = self.read_html(url, ".*Water.*")
         df2 = self.read_html(url, "Unit")
@@ -376,7 +378,7 @@ def test_python_docs_table(self):
     @pytest.mark.slow
     def test_thousands_macau_stats(self, datapath):
         all_non_nan_table_index = -2
-        macau_data = datapath("io", "data", "macau.html")
+        macau_data = datapath("io", "data", "html", "macau.html")
         dfs = self.read_html(macau_data, index_col=0, attrs={"class": "style1"})
         df = dfs[all_non_nan_table_index]
@@ -385,7 +387,7 @@ def test_thousands_macau_index_col(self, datapath):
         all_non_nan_table_index = -2
-        macau_data = datapath("io", "data", "macau.html")
+        macau_data = datapath("io", "data", "html", "macau.html")
         dfs = self.read_html(macau_data, index_col=0, header=0)
         df = dfs[all_non_nan_table_index]
@@ -566,7 +568,7 @@ def test_parse_header_of_non_string_column(self):
         tm.assert_frame_equal(result, expected)

     def test_nyse_wsj_commas_table(self, datapath):
-        data = datapath("io", "data", "nyse_wsj.html")
+        data = datapath("io", "data", "html", "nyse_wsj.html")
         df = self.read_html(data, index_col=0, header=0, attrs={"class": "mdcTable"})[0]

         expected = Index(
@@ -594,7 +596,7 @@ def try_remove_ws(x):
         df = self.read_html(self.banklist_data, "Metcalf", attrs={"id": "table"})[0]
         ground_truth = read_csv(
-            datapath("io", "data", "banklist.csv"),
+            datapath("io", "data", "csv", "banklist.csv"),
             converters={"Updated Date": Timestamp, "Closing Date": Timestamp},
         )
         assert df.shape == ground_truth.shape
@@ -889,7 +891,7 @@ def test_parse_dates_combine(self):
         tm.assert_frame_equal(newdf, res[0])

     def test_computer_sales_page(self, datapath):
-        data = datapath("io", "data", "computer_sales_page.html")
+        data = datapath("io", "data", "html", "computer_sales_page.html")
         msg = (
             r"Passed header=\[0,1\] are too many "
             r"rows for this multi_index of columns"
         )
         with pytest.raises(ParserError, match=msg):
             self.read_html(data, header=[0, 1])

-        data = datapath("io", "data", "computer_sales_page.html")
+        data = datapath("io", "data", "html", "computer_sales_page.html")
         assert self.read_html(data, header=[1, 2])

     def test_wikipedia_states_table(self, datapath):
-        data = datapath("io", "data", "wikipedia_states.html")
+        data = datapath("io", "data", "html", "wikipedia_states.html")
         assert os.path.isfile(data), "{data!r} is not a file".format(data=data)
         assert os.path.getsize(data), "{data!r} is an empty file".format(data=data)
         result = self.read_html(data, "Arizona", header=1)[0]
@@ -1095,14 +1097,14 @@ def test_multiple_header_rows(self):
         tm.assert_frame_equal(expected_df, html_df)

     def test_works_on_valid_markup(self, datapath):
-        filename = datapath("io", "data", "valid_markup.html")
+        filename = datapath("io", "data", "html", "valid_markup.html")
         dfs = self.read_html(filename, index_col=0)
         assert isinstance(dfs, list)
         assert isinstance(dfs[0], DataFrame)

     @pytest.mark.slow
     def test_fallback_success(self, datapath):
-        banklist_data = datapath("io", "data", "banklist.html")
+        banklist_data = datapath("io", "data", "html", "banklist.html")
         self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"])

     def test_to_html_timestamp(self):
@@ -1240,7 +1242,7 @@ def run(self):
         # force import check by reinitalising global vars in html.py
         reload(pandas.io.html)

-        filename = datapath("io", "data", "valid_markup.html")
+        filename = datapath("io", "data", "html", "valid_markup.html")
         helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
         helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))

diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 23a16c885687f..3be966edef080 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -202,23 +202,25 @@ def test_legacy_sparse_warning(datapath):

     Generated with

    >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [0, 0, 1, 1]}).to_sparse()
-    >>> df.to_pickle("pandas/tests/io/data/sparseframe-0.20.3.pickle.gz",
+    >>> df.to_pickle("pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz",
     ...              compression="gzip")

    >>> s = df['B']
-    >>> s.to_pickle("pandas/tests/io/data/sparseseries-0.20.3.pickle.gz",
+    >>> s.to_pickle("pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz",
     ...              compression="gzip")
     """
     with tm.assert_produces_warning(FutureWarning):
         simplefilter("ignore", DeprecationWarning)  # from boto
         pd.read_pickle(
-            datapath("io", "data", "sparseseries-0.20.3.pickle.gz"), compression="gzip"
+            datapath("io", "data", "pickle", "sparseseries-0.20.3.pickle.gz"),
+            compression="gzip",
         )

     with tm.assert_produces_warning(FutureWarning):
         simplefilter("ignore", DeprecationWarning)  # from boto
         pd.read_pickle(
-            datapath("io", "data", "sparseframe-0.20.3.pickle.gz"), compression="gzip"
+            datapath("io", "data", "pickle", "sparseframe-0.20.3.pickle.gz"),
+            compression="gzip",
         )

@@ -382,7 +384,7 @@ def test_read(self, protocol, get_random_path):

 def test_unicode_decode_error():
     # pickle file written with py27, should be readable without raising
     # UnicodeDecodeError, see GH#28645
-    path = os.path.join(os.path.dirname(__file__), "data", "test_py27.pkl")
+    path = os.path.join(os.path.dirname(__file__), "data", "pickle", "test_py27.pkl")
     df = pd.read_pickle(path)

     # just test the columns are correct since the values are random
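Outside the test harness, reading one of these legacy gzip-compressed pickles
looks like the call below; for the 0.20.3 sparse objects it is expected to
raise a FutureWarning, which the test asserts with `tm.assert_produces_warning`
(path assumed for illustration):

    import pandas as pd

    df = pd.read_pickle(
        "pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz",
        compression="gzip",  # could also be inferred from the .gz suffix
    )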
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
index ca84156d104fc..ccf3167d49371 100644
--- a/pandas/tests/io/test_spss.py
+++ b/pandas/tests/io/test_spss.py
@@ -9,7 +9,7 @@
 def test_spss_labelled_num(datapath):
     # test file from the Haven project (https://haven.tidyverse.org/)
-    fname = datapath("io", "data", "labelled-num.sav")
+    fname = datapath("io", "data", "spss", "labelled-num.sav")

     df = pd.read_spss(fname, convert_categoricals=True)
     expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0])
@@ -23,7 +23,7 @@ def test_spss_labelled_num(datapath):

 def test_spss_labelled_num_na(datapath):
     # test file from the Haven project (https://haven.tidyverse.org/)
-    fname = datapath("io", "data", "labelled-num-na.sav")
+    fname = datapath("io", "data", "spss", "labelled-num-na.sav")

     df = pd.read_spss(fname, convert_categoricals=True)
     expected = pd.DataFrame({"VAR00002": ["This is one", None]})
@@ -37,7 +37,7 @@ def test_spss_labelled_num_na(datapath):

 def test_spss_labelled_str(datapath):
     # test file from the Haven project (https://haven.tidyverse.org/)
-    fname = datapath("io", "data", "labelled-str.sav")
+    fname = datapath("io", "data", "spss", "labelled-str.sav")

     df = pd.read_spss(fname, convert_categoricals=True)
     expected = pd.DataFrame({"gender": ["Male", "Female"]})
@@ -51,7 +51,7 @@ def test_spss_labelled_str(datapath):

 def test_spss_umlauts(datapath):
     # test file from the Haven project (https://haven.tidyverse.org/)
-    fname = datapath("io", "data", "umlauts.sav")
+    fname = datapath("io", "data", "spss", "umlauts.sav")

     df = pd.read_spss(fname, convert_categoricals=True)
     expected = pd.DataFrame(
@@ -67,7 +67,7 @@ def test_spss_umlauts(datapath):

 def test_spss_usecols(datapath):
     # usecols must be list-like
-    fname = datapath("io", "data", "labelled-num.sav")
+    fname = datapath("io", "data", "spss", "labelled-num.sav")

     with pytest.raises(TypeError, match="usecols must be list-like."):
         pd.read_spss(fname, usecols="VAR00002")

diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 183a47c6039ec..1c80dd9e59164 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -275,7 +275,7 @@ def _get_exec(self):
         else:
             return self.conn.cursor()

-    @pytest.fixture(params=[("io", "data", "iris.csv")])
+    @pytest.fixture(params=[("data", "iris.csv")])
     def load_iris_data(self, datapath, request):
         import io

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index a0ec06a2197ae..7fa3b968278d9 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -28,7 +28,7 @@

 @pytest.fixture
 def dirpath(datapath):
-    return datapath("io", "data")
+    return datapath("io", "data", "stata")

 @pytest.fixture
@@ -42,7 +42,7 @@ def parsed_114(dirpath):
 class TestStata:
     @pytest.fixture(autouse=True)
     def setup_method(self, datapath):
-        self.dirpath = datapath("io", "data")
+        self.dirpath = datapath("io", "data", "stata")
         self.dta1_114 = os.path.join(self.dirpath, "stata1_114.dta")
         self.dta1_117 = os.path.join(self.dirpath, "stata1_117.dta")

From 0e991116162c3c9a606c9cad7086d7ba49efc638 Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Wed, 13 Nov 2019 18:45:28 +0100
Subject: [PATCH 040/185] CLN: Removing Python 3.6 or higher references that
 are always true (#29492)

---
 pandas/_libs/tslibs/timestamps.pyx            |  6 ++--
 pandas/compat/__init__.py                     |  1 -
 pandas/core/common.py                         | 13 +-------
 pandas/core/dtypes/common.py                  | 12 ++-----
 pandas/core/frame.py                          | 33 ++++-----------
 pandas/core/generic.py                        |  8 +----
 pandas/core/groupby/generic.py                |  8 -----
 pandas/core/internals/construction.py         | 16 +++------
 pandas/core/reshape/concat.py                 |  2 +-
 pandas/core/series.py                         |  9 -----
 pandas/io/pickle.py                           |  6 ++--
 pandas/tests/extension/json/test_json.py      | 10 ------
 pandas/tests/frame/test_constructors.py       | 17 ++--------
 pandas/tests/frame/test_mutate_columns.py     | 26 +--------------
 .../tests/groupby/aggregate/test_aggregate.py |  8 ++---
 pandas/tests/indexes/test_base.py             | 23 +++-------
 pandas/tests/indexing/test_indexing.py        |  3 +-
 pandas/tests/internals/test_internals.py      | 25 --------------
 pandas/tests/io/excel/test_writers.py         |  2 --
 pandas/tests/io/formats/test_format.py        |  7 ++--
 pandas/tests/io/json/test_normalize.py        |  6 ++--
 pandas/tests/io/parser/test_common.py         |  4 ---
 pandas/tests/io/pytables/test_store.py        |  3 +-
 pandas/tests/reshape/test_concat.py           |  3 +-
 .../tests/scalar/timestamp/test_unary_ops.py  |  2 --
 pandas/tests/series/test_constructors.py      |  6 +---
 pandas/tests/test_downstream.py               | 19 +++-------
 27 files changed, 43 insertions(+), 235 deletions(-)
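The theme of this patch: every interpreter that pandas still supports is at
least Python 3.6, so a `PY36` gate is always true and can be deleted, while
the gates that can still vary are kept. Schematically:

    import sys

    # gates that still carry information on supported interpreters
    PY37 = sys.version_info >= (3, 7)
    PY38 = sys.version_info >= (3, 8)

    # the removed gate is a tautology under the supported versions
    assert sys.version_info >= (3, 6)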
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 317dc769636fb..03ed26337d539 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -36,7 +36,6 @@ from pandas._libs.tslibs.tzconversion import (
 # Constants
 _zero_time = datetime_time(0, 0)
 _no_input = object()
-PY36 = sys.version_info >= (3, 6)

 # ----------------------------------------------------------------------

@@ -982,9 +981,8 @@ default 'raise'
         else:
             kwargs = {'year': dts.year, 'month': dts.month, 'day': dts.day,
                       'hour': dts.hour, 'minute': dts.min, 'second': dts.sec,
-                      'microsecond': dts.us, 'tzinfo': _tzinfo}
-            if PY36:
-                kwargs['fold'] = fold
+                      'microsecond': dts.us, 'tzinfo': _tzinfo,
+                      'fold': fold}
             ts_input = datetime(**kwargs)

         ts = convert_datetime_to_tsobject(ts_input, _tzinfo)

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 890d0aca0019d..684fbbc23c86c 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -12,7 +12,6 @@
 import sys
 import warnings

-PY36 = sys.version_info >= (3, 6)
 PY37 = sys.version_info >= (3, 7)
 PY38 = sys.version_info >= (3, 8)
 PYPY = platform.python_implementation() == "PyPy"

diff --git a/pandas/core/common.py b/pandas/core/common.py
index 565f5076fdddb..133e60de5d694 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -5,7 +5,7 @@
 """

 import collections
-from collections import OrderedDict, abc
+from collections import abc
 from datetime import datetime, timedelta
 from functools import partial
 import inspect
@@ -14,7 +14,6 @@
 import numpy as np

 from pandas._libs import lib, tslibs
-from pandas.compat import PY36

 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
 from pandas.core.dtypes.common import (
@@ -215,16 +214,6 @@ def try_sort(iterable):
     return listed


-def dict_keys_to_ordered_list(mapping):
-    # when pandas drops support for Python < 3.6, this function
-    # can be replaced by a simple list(mapping.keys())
-    if PY36 or isinstance(mapping, OrderedDict):
-        keys = list(mapping.keys())
-    else:
-        keys = try_sort(mapping)
-    return keys
-
-
 def asarray_tuplesafe(values, dtype=None):
     if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")):

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 1ed54c12f4a34..635bc5ce03680 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -6,7 +6,6 @@
 from pandas._libs import algos, lib
 from pandas._libs.tslibs import conversion
-from pandas.compat import PY36

 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
@@ -1266,9 +1265,6 @@ def _is_unorderable_exception(e: TypeError) -> bool:
     """
     Check if the exception raised is an unorderable exception.

-    The error message differs for 3 <= PY <= 3.5 and PY >= 3.6, so
-    we need to condition based on Python version.
-
     Parameters
     ----------
     e : Exception or sub-class
@@ -1276,14 +1272,10 @@
     Returns
     -------
-    boolean
+    bool
         Whether or not the exception raised is an unorderable exception.
     """
-
-    if PY36:
-        return "'>' not supported between instances of" in str(e)
-
-    return "unorderable" in str(e)
+    return "'>' not supported between instances of" in str(e)


 def needs_i8_conversion(arr_or_dtype) -> bool:

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 39e325a7e047b..ce74081fb655b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -34,7 +34,6 @@
 from pandas._config import get_option

 from pandas._libs import algos as libalgos, lib
-from pandas.compat import PY36
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import (
     Appender,
@@ -3498,16 +3497,12 @@ def assign(self, **kwargs):
         Notes
         -----
         Assigning multiple columns within the same ``assign`` is possible.
-        For Python 3.6 and above, later items in '\*\*kwargs' may refer to
-        newly created or modified columns in 'df'; items are computed and
-        assigned into 'df' in order.  For Python 3.5 and below, the order of
-        keyword arguments is not specified, you cannot refer to newly created
-        or modified columns. All items are computed first, and then assigned
-        in alphabetical order.
+        Later items in '\*\*kwargs' may refer to newly created or modified
+        columns in 'df'; items are computed and assigned into 'df' in order.

         .. versionchanged:: 0.23.0

-           Keyword argument order is maintained for Python 3.6 and later.
+           Keyword argument order is maintained.

         Examples
         --------
@@ -3533,9 +3528,8 @@ def assign(self, **kwargs):
         Portland    17.0    62.6
         Berkeley    25.0    77.0

-        In Python 3.6+, you can create multiple columns within the same assign
-        where one of the columns depends on another one defined within the same
-        assign:
+        You can create multiple columns within the same assign where one
+        of the columns depends on another one defined within the same assign:

         >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
         ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
@@ -3545,21 +3539,8 @@ def assign(self, **kwargs):
         """
         data = self.copy()

-        # >= 3.6 preserve order of kwargs
-        if PY36:
-            for k, v in kwargs.items():
-                data[k] = com.apply_if_callable(v, data)
-        else:
-            # <= 3.5: do all calculations first...
-            results = OrderedDict()
-            for k, v in kwargs.items():
-                results[k] = com.apply_if_callable(v, data)
-
-            # <= 3.5 and earlier
-            results = sorted(results.items())
-            # ... and then assign
-            for k, v in results:
-                data[k] = v
+        for k, v in kwargs.items():
+            data[k] = com.apply_if_callable(v, data)
         return data

     def _sanitize_column(self, key, value, broadcast=True):
- _attrs = {} # type: Dict[Optional[Hashable], Any] + _attrs: Dict[Optional[Hashable], Any] # ---------------------------------------------------------------------- # Constructors diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dda98d2dd438b..8f0b8a1e37af2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -28,7 +28,6 @@ import numpy as np from pandas._libs import Timestamp, lib -from pandas.compat import PY36 from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( @@ -233,10 +232,6 @@ def aggregate(self, func=None, *args, **kwargs): no_arg_message = "Must provide 'func' or named aggregation **kwargs." if relabeling: columns = list(kwargs) - if not PY36: - # sort for 3.5 and earlier - columns = list(sorted(columns)) - func = [kwargs[col] for col in columns] kwargs = {} if not columns: @@ -1804,9 +1799,6 @@ def _normalize_keyword_aggregation(kwargs): >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) """ - if not PY36: - kwargs = OrderedDict(sorted(kwargs.items())) - # Normalize the aggregation functions as Dict[column, List[func]], # process normally, then fixup the names. # TODO(Py35): When we drop python 3.5, change this to diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index bb62db431ac73..c24fb0a7dc39a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -2,14 +2,12 @@ Functions for preparing various inputs passed to the DataFrame or Series constructors before passing them to a BlockManager. """ -from collections import OrderedDict, abc +from collections import abc import numpy as np import numpy.ma as ma from pandas._libs import lib -import pandas.compat as compat -from pandas.compat import PY36 from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -235,7 +233,7 @@ def init_dict(data, index, columns, dtype=None): arrays.loc[missing] = [val] * missing.sum() else: - keys = com.dict_keys_to_ordered_list(data) + keys = list(data.keys()) columns = data_names = Index(keys) arrays = (com.maybe_iterable_to_list(data[k]) for k in keys) # GH#24096 need copy to be deep for datetime64tz case @@ -331,7 +329,6 @@ def extract_index(data): have_raw_arrays = False have_series = False have_dicts = False - have_ordered = False for val in data: if isinstance(val, ABCSeries): @@ -339,8 +336,6 @@ def extract_index(data): indexes.append(val.index) elif isinstance(val, dict): have_dicts = True - if isinstance(val, OrderedDict): - have_ordered = True indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True @@ -352,7 +347,7 @@ def extract_index(data): if have_series: index = _union_indexes(indexes) elif have_dicts: - index = _union_indexes(indexes, sort=not (compat.PY36 or have_ordered)) + index = _union_indexes(indexes, sort=False) if have_raw_arrays: lengths = list(set(raw_lengths)) @@ -531,7 +526,7 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): """Convert list of dicts to numpy arrays if `columns` is not passed, column names are inferred from the records - - for OrderedDict and (on Python>=3.6) dicts, the column names match + - for OrderedDict and dicts, the column names match the key insertion-order from the first record to the last. - For other kinds of dict-likes, the keys are lexically sorted. 
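For context, the insertion-order guarantee that all of these removals rely on can be sketched as follows (an illustrative snippet, not part of the patch; it assumes Python >= 3.6 and a pandas build that already includes these changes, and the column names are arbitrary):

    import pandas as pd

    # dict keys and **kwargs now always preserve insertion order,
    # so the old lexical-sort fallback for Python < 3.6 is dead code
    df = pd.DataFrame({"b": [1, 2], "a": [3, 4]})
    assert list(df.columns) == ["b", "a"]

    ser = pd.Series({"b": 1, "a": 0, "c": 2})
    assert list(ser.index) == ["b", "a", "c"]

    # assign() evaluates keyword arguments left to right, so a later
    # column may refer to one created earlier in the same call
    out = df.assign(d=lambda x: x["a"] + x["b"], e=lambda x: x["d"] * 2)
    assert list(out.columns) == ["b", "a", "d", "e"]

This is why the PY36 branches, the OrderedDict special-casing, and the sort fallbacks can be deleted outright rather than deprecated.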
@@ -551,8 +546,7 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: gen = (list(x.keys()) for x in data) - types = (dict, OrderedDict) if PY36 else OrderedDict - sort = not any(isinstance(d, types) for d in data) + sort = not any(isinstance(d, dict) for d in data) columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) # assure that they are of the base dict class and not of derived diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 772ac1cd93059..3c1b2b1eb11d2 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -293,7 +293,7 @@ def __init__( if isinstance(objs, dict): if keys is None: - keys = com.dict_keys_to_ordered_list(objs) + keys = list(objs.keys()) objs = [objs[k] for k in keys] else: objs = list(objs) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7327c2d543836..c5e639fef8c5b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1,7 +1,6 @@ """ Data structure for 1-dimensional cross-sectional and time series data """ -from collections import OrderedDict from io import StringIO from shutil import get_terminal_size from textwrap import dedent @@ -13,7 +12,6 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, reshape, tslibs -from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, deprecate from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -364,13 +362,6 @@ def _init_dict(self, data, index=None, dtype=None): # Now we just make sure the order is respected, if any if data and index is not None: s = s.reindex(index, copy=False) - elif not PY36 and not isinstance(data, OrderedDict) and data: - # Need the `and data` to avoid sorting Series(None, index=[...]) - # since that isn't really dict-like - try: - s = s.sort_index() - except TypeError: - pass return s._data, s.index @classmethod diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 5e066c4f9ecbd..0a0ccedd78f00 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -2,7 +2,7 @@ import pickle import warnings -from pandas.compat import PY36, pickle_compat as pc +from pandas.compat import pickle_compat as pc from pandas.io.common import _get_handle, _stringify_path @@ -140,9 +140,7 @@ def read_pickle(path, compression="infer"): # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes - excs_to_catch = (AttributeError, ImportError) - if PY36: - excs_to_catch += (ModuleNotFoundError,) + excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError) try: with warnings.catch_warnings(record=True): diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index bc75ec6aeb2df..7e027a65eec3a 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -3,8 +3,6 @@ import pytest -from pandas.compat import PY36 - import pandas as pd from pandas.tests.extension import base import pandas.util.testing as tm @@ -180,9 +178,6 @@ def test_fillna_frame(self): unhashable = pytest.mark.skip(reason="Unhashable") -unstable = pytest.mark.skipif( - not PY36, reason="Dictionary order unstable" # 3.6 or higher -) class TestReduce(base.BaseNoReduceTests): @@ -199,20 +194,16 @@ def test_sort_values_frame(self): # TODO (EA.factorize): see if _values_for_factorize allows this. 
pass - @unstable def test_argsort(self, data_for_sorting): super().test_argsort(data_for_sorting) - @unstable def test_argsort_missing(self, data_missing_for_sorting): super().test_argsort_missing(data_missing_for_sorting) - @unstable @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending): super().test_sort_values(data_for_sorting, ascending) - @unstable @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_missing(self, data_missing_for_sorting, ascending): super().test_sort_values_missing(data_missing_for_sorting, ascending) @@ -280,7 +271,6 @@ def test_groupby_extension_apply(self): we'll be able to dispatch unique. """ - @unstable @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1d030bbc75521..77a7783deeee3 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -8,7 +8,7 @@ import numpy.ma.mrecords as mrecords import pytest -from pandas.compat import PY36, is_platform_little_endian +from pandas.compat import is_platform_little_endian from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_integer_dtype @@ -387,7 +387,6 @@ def test_constructor_dict_nan_tuple_key(self, value): result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY36, reason="Insertion order for Python>=3.6") def test_constructor_dict_order_insertion(self): datetime_series = tm.makeTimeSeries(nper=30) datetime_series_short = tm.makeTimeSeries(nper=25) @@ -399,18 +398,6 @@ def test_constructor_dict_order_insertion(self): expected = DataFrame(data=d, columns=list("ba")) tm.assert_frame_equal(frame, expected) - @pytest.mark.skipif(PY36, reason="order by value for Python<3.6") - def test_constructor_dict_order_by_values(self): - datetime_series = tm.makeTimeSeries(nper=30) - datetime_series_short = tm.makeTimeSeries(nper=25) - - # GH19018 - # initialization ordering: by value if python<3.6 - d = {"b": datetime_series_short, "a": datetime_series} - frame = DataFrame(data=d) - expected = DataFrame(data=d, columns=list("ab")) - tm.assert_frame_equal(frame, expected) - def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame @@ -1373,7 +1360,7 @@ def test_constructor_list_of_dict_order(self): } ) result = DataFrame(data) - tm.assert_frame_equal(result, expected, check_like=not PY36) + tm.assert_frame_equal(result, expected) def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 7feb55f2fac09..8c0dd67af4e7d 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import PY36 - from pandas import DataFrame, Index, MultiIndex, Series import pandas.util.testing as tm @@ -60,10 +58,7 @@ def test_assign_order(self): df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) result = df.assign(D=df.A + df.B, C=df.A - df.B) - if PY36: - expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) - else: - expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], 
columns=list("ABCD")) + expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) tm.assert_frame_equal(result, expected) result = df.assign(C=df.A - df.B, D=df.A + df.B) @@ -80,25 +75,6 @@ def test_assign_bad(self): with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) - @pytest.mark.skipif( - PY36, - reason="""Issue #14207: valid for python - 3.6 and above""", - ) - def test_assign_dependent_old_python(self): - df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - - # Key C does not exist at definition time of df - with pytest.raises(KeyError, match="^'C'$"): - df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) - with pytest.raises(KeyError, match="^'C'$"): - df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) - - @pytest.mark.skipif( - not PY36, - reason="""Issue #14207: not valid for - python 3.5 and below""", - ) def test_assign_dependent(self): df = DataFrame({"A": [1, 2], "B": [3, 4]}) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4313b52798c6e..113c2c6d6d4ac 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -8,7 +8,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas import DataFrame, Index, MultiIndex, Series, concat from pandas.core.base import SpecificationError from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping @@ -361,9 +361,7 @@ def test_series_named_agg(self): tm.assert_frame_equal(result, expected) result = gr.agg(b="min", a="sum") - # sort for 35 and earlier - if compat.PY36: - expected = expected[["b", "a"]] + expected = expected[["b", "a"]] tm.assert_frame_equal(result, expected) def test_no_args_raises(self): @@ -425,8 +423,6 @@ def test_agg_relabel(self): index=pd.Index(["a", "b"], name="group"), columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"], ) - if not compat.PY36: - expected = expected[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]] tm.assert_frame_equal(result, expected) def test_agg_relabel_non_identifier(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e43d340a46d9f..fe9953341fdae 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -11,7 +11,6 @@ import pandas._config.config as cf from pandas._libs.tslib import Timestamp -from pandas.compat import PY36 from pandas.compat.numpy import np_datetime64_compat from pandas.core.dtypes.common import is_unsigned_integer_dtype @@ -1616,11 +1615,7 @@ def test_get_loc(self, method): def test_get_loc_raises_bad_label(self, method): index = pd.Index([0, 1, 2]) if method: - # Messages vary across versions - if PY36: - msg = "not supported between" - else: - msg = "unorderable types" + msg = "not supported between" else: msg = "invalid key" @@ -2444,21 +2439,13 @@ def create_index(self): def test_argsort(self): index = self.create_index() - if PY36: - with pytest.raises(TypeError, match="'>|<' not supported"): - index.argsort() - else: - with pytest.raises(TypeError, match="unorderable types"): - index.argsort() + with pytest.raises(TypeError, match="'>|<' not supported"): + index.argsort() def test_numpy_argsort(self): index = self.create_index() - if PY36: - with pytest.raises(TypeError, match="'>|<' not supported"): - np.argsort(index) - else: - with pytest.raises(TypeError, match="unorderable types"): 
- np.argsort(index) + with pytest.raises(TypeError, match="'>|<' not supported"): + np.argsort(index) def test_copy_name(self): # Check that "name" argument passed at initialization is honoured diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d6d3763981131..fc5753ec2955c 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -7,7 +7,6 @@ import numpy as np import pytest -from pandas.compat import PY36 from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype @@ -222,7 +221,7 @@ def test_setitem_dtype_upcast(self): expected = DataFrame( [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] ) - tm.assert_frame_equal(df, expected, check_like=not PY36) + tm.assert_frame_equal(df, expected) # GH10280 df = DataFrame( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ee7fca6ec7672..fa7a98c617677 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1,10 +1,8 @@ from collections import OrderedDict from datetime import date, datetime -from distutils.version import LooseVersion import itertools import operator import re -import sys import numpy as np import pytest @@ -26,9 +24,6 @@ from pandas.core.internals import BlockManager, SingleBlockManager, make_block import pandas.util.testing as tm -# in 3.6.1 a c-api slicing function changed, see src/compat_helper.h -PY361 = LooseVersion(sys.version) >= LooseVersion("3.6.1") - @pytest.fixture def mgr(): @@ -1096,10 +1091,6 @@ def assert_as_slice_equals(arr, slc): assert_as_slice_equals([2, 1], slice(2, 0, -1)) - if not PY361: - assert_as_slice_equals([2, 1, 0], slice(2, None, -1)) - assert_as_slice_equals([100, 0], slice(100, None, -100)) - def test_not_slice_like_arrays(self): def assert_not_slice_like(arr): assert not BlockPlacement(arr).is_slice_like @@ -1119,10 +1110,6 @@ def test_slice_iter(self): assert list(BlockPlacement(slice(0, 0))) == [] assert list(BlockPlacement(slice(3, 0))) == [] - if not PY361: - assert list(BlockPlacement(slice(3, 0, -1))) == [3, 2, 1] - assert list(BlockPlacement(slice(3, None, -1))) == [3, 2, 1, 0] - def test_slice_to_array_conversion(self): def assert_as_array_equals(slc, asarray): tm.assert_numpy_array_equal( @@ -1135,10 +1122,6 @@ def assert_as_array_equals(slc, asarray): assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) - if not PY361: - assert_as_array_equals(slice(3, None, -1), [3, 2, 1, 0]) - assert_as_array_equals(slice(31, None, -10), [31, 21, 11, 1]) - def test_blockplacement_add(self): bpl = BlockPlacement(slice(0, 5)) assert bpl.add(1).as_slice == slice(1, 6, 1) @@ -1168,14 +1151,6 @@ def assert_add_equals(val, inc, result): with pytest.raises(ValueError): BlockPlacement([1, 2, 4]).add(-10) - if not PY361: - assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0]) - assert_add_equals(slice(2, None, -1), 0, [2, 1, 0]) - assert_add_equals(slice(2, None, -1), 10, [12, 11, 10]) - - with pytest.raises(ValueError): - BlockPlacement(slice(2, None, -1)).add(-1) - class DummyElement: def __init__(self, value, dtype): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 1bc4ad3e7867a..a7730e079a1bb 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from pandas.compat import PY36 import pandas.util._test_decorators as td 
import pandas as pd @@ -1262,7 +1261,6 @@ def check_called(func): @td.skip_if_no("xlrd") @td.skip_if_no("openpyxl") -@pytest.mark.skipif(not PY36, reason="requires fspath") class TestFSPath: def test_excelfile_fspath(self): with tm.ensure_clean("foo.xlsx") as path: diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 704de378b0909..0f4a7a33dd115 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -18,7 +18,7 @@ import pytest import pytz -from pandas.compat import PY36, is_platform_32bit, is_platform_windows +from pandas.compat import is_platform_32bit, is_platform_windows import pandas as pd from pandas import ( @@ -62,10 +62,7 @@ def filepath_or_buffer(filepath_or_buffer_id, tmp_path): yield buf assert not buf.closed else: - if PY36: - assert isinstance(tmp_path, Path) - else: - assert hasattr(tmp_path, "__fspath__") + assert isinstance(tmp_path, Path) if filepath_or_buffer_id == "pathlike": yield tmp_path / "foo" else: diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a3ca61cb1eb63..c71c52bce87b8 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import PY36 - from pandas import DataFrame, Index import pandas.util.testing as tm @@ -382,7 +380,7 @@ def test_missing_field(self, author_missing_data): }, ] expected = DataFrame(ex_data) - tm.assert_frame_equal(result, expected, check_like=not PY36) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "max_level,expected", @@ -524,7 +522,7 @@ def test_missing_meta(self, missing_metadata): columns = ["city", "number", "state", "street", "zip", "name"] columns = ["number", "street", "city", "state", "zip", "name"] expected = DataFrame(ex_data, columns=columns) - tm.assert_frame_equal(result, expected, check_like=not PY36) + tm.assert_frame_equal(result, expected) def test_donot_drop_nonevalues(self): # GH21356 diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 6e6c31bc5b972..590f26a76802a 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2160,10 +2160,6 @@ def test_suppress_error_output(all_parsers, capsys): assert captured.err == "" -@pytest.mark.skipif( - compat.is_platform_windows() and not compat.PY36, - reason="On Python < 3.6 won't pass on Windows", -) @pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) def test_filename_with_special_chars(all_parsers, filename): # see gh-15086. 
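The fspath-related guards removed above and below depend on ``os.fspath`` support, which is unconditional on Python >= 3.6. A minimal sketch of the behavior now exercised on every supported Python (illustrative only, not part of the patch; the file name example.csv is an arbitrary placeholder):

    from pathlib import Path

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    path = Path("example.csv")  # any os.PathLike object is accepted

    # previously gated on PY36; now exercised unconditionally
    df.to_csv(path, index=False)
    print(pd.read_csv(path))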
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f9d525399bde3..a43da75005a65 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas.compat import PY36, is_platform_little_endian, is_platform_windows +from pandas.compat import is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_categorical_dtype @@ -4711,7 +4711,6 @@ def test_read_hdf_series_mode_r(self, format, setup_path): result = pd.read_hdf(path, key="data", mode="r") tm.assert_series_equal(result, series) - @pytest.mark.skipif(not PY36, reason="Need python 3.6") def test_fspath(self): with tm.ensure_clean("foo.h5") as path: with pd.HDFStore(path) as store: diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 46dafbc4e1ec8..323b3126c2461 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -27,7 +27,6 @@ isna, read_csv, ) -import pandas.core.common as com from pandas.tests.extension.decimal import to_decimal import pandas.util.testing as tm @@ -1264,7 +1263,7 @@ def test_concat_dict(self): "qux": DataFrame(np.random.randn(4, 3)), } - sorted_keys = com.dict_keys_to_ordered_list(frames) + sorted_keys = list(frames.keys()) result = concat(frames) expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index dffb957b8f3b0..db63e0bf9cd30 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -7,7 +7,6 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG -from pandas.compat import PY36 import pandas.util._test_decorators as td from pandas import NaT, Timestamp @@ -375,7 +374,6 @@ def test_replace_dst_border(self): expected = Timestamp("2013-11-3 03:00:00", tz="America/Chicago") assert result == expected - @pytest.mark.skipif(not PY36, reason="Fold not available until PY3.6") @pytest.mark.parametrize("fold", [0, 1]) @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) def test_replace_dst_fold(self, fold, tz): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 4cbade2669cc6..34b11a0d008aa 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -7,7 +7,6 @@ from pandas._libs import lib from pandas._libs.tslib import iNaT -from pandas.compat import PY36 from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel @@ -1048,10 +1047,7 @@ def test_constructor_dict_order(self): # order by value d = {"b": 1, "a": 0, "c": 2} result = Series(d) - if PY36: - expected = Series([1, 0, 2], index=list("bac")) - else: - expected = Series([0, 1, 2], index=list("abc")) + expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index fb0511f8902f7..dc88ebe1f7f8e 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,8 +8,6 @@ import numpy as np # noqa import pytest -from pandas.compat 
import PY36 - from pandas import DataFrame, Series import pandas.util.testing as tm @@ -18,19 +16,10 @@ def import_module(name): # we *only* want to skip if the module is truly not available # and NOT just an actual import error because of pandas changes - if PY36: - try: - return importlib.import_module(name) - except ModuleNotFoundError: # noqa - pytest.skip("skipping as {} not available".format(name)) - - else: - try: - return importlib.import_module(name) - except ImportError as e: - if "No module named" in str(e) and name in str(e): - pytest.skip("skipping as {} not available".format(name)) - raise + try: + return importlib.import_module(name) + except ModuleNotFoundError: # noqa + pytest.skip("skipping as {} not available".format(name)) @pytest.fixture From 966757faa28cc536b1bca4856f1dc693ec0bd2ea Mon Sep 17 00:00:00 2001 From: Karthigeyan Date: Thu, 14 Nov 2019 13:00:13 +0530 Subject: [PATCH 041/185] CLN: noqa removal (#29574) --- asv_bench/benchmarks/gil.py | 2 +- asv_bench/benchmarks/offset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index d57492dd37268..43c8594b8c8df 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -37,7 +37,7 @@ def wrapper(fname): return wrapper -from .pandas_vb_common import BaseIO # noqa: E402 isort:skip +from .pandas_vb_common import BaseIO # isort:skip class ParallelGroupbyMethods: diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index a960f43f46acd..77ce1b2763bce 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -3,7 +3,7 @@ import pandas as pd try: - import pandas.tseries.holiday # noqa + import pandas.tseries.holiday except ImportError: pass From 8a7accdfcfcf36b0875654c946204007ac830153 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Thu, 14 Nov 2019 12:50:22 +0000 Subject: [PATCH 042/185] Remove ref to branch (#29603) --- pandas/tests/io/excel/test_readers.py | 6 +----- pandas/tests/io/test_html.py | 4 +--- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8c3b9a07641f7..70a86c2233513 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -499,13 +499,9 @@ def test_bad_engine_raises(self, read_ext): @tm.network def test_read_from_http_url(self, read_ext): - if read_ext == ".ods": # TODO: remove once on master - pytest.skip() - # TODO: alimcmaster1 - revert to master url = ( - "https://raw.githubusercontent.com/alimcmaster1" - "/pandas/mcmali-tests-dir-struct/" + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" "pandas/tests/io/data/excel/test1" + read_ext ) url_table = pd.read_excel(url) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 1cd6740ab831a..d8d617ceeebff 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -133,10 +133,8 @@ def test_banklist_url(self): @tm.network def test_spam_url(self): - # TODO: alimcmaster1 - revert to master url = ( - "https://raw.githubusercontent.com/alimcmaster1/" - "pandas/mcmali-tests-dir-struct/" + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" "pandas/tests/io/data/html/spam.html" ) df1 = self.read_html(url, ".*Water.*") From c5766c1bfd5220c12b8c244f34f1f53a16dba1cc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 04:54:41 -0800 Subject: [PATCH 043/185] CLN: follow-ups (#29600) --- 
pandas/_libs/reduction.pyx | 29 ++++++++++++----------------- pandas/_libs/tslibs/c_timestamp.pyx | 12 ++++++------ pandas/_libs/tslibs/nattype.pyx | 8 ++++---- pandas/_libs/tslibs/timedeltas.pyx | 16 ++++++++-------- pandas/core/groupby/ops.py | 3 +-- pandas/core/reshape/merge.py | 4 ++-- 6 files changed, 33 insertions(+), 39 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 11dc2d04bb74e..eadb8003beba3 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -82,11 +82,9 @@ cdef class Reducer: else: # we passed a series-like - if hasattr(dummy, 'values'): - - typ = type(dummy) - index = getattr(dummy, 'index', None) - dummy = dummy.values + typ = type(dummy) + index = dummy.index + dummy = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') @@ -99,10 +97,10 @@ cdef class Reducer: cdef: char* dummy_buf ndarray arr, result, chunk - Py_ssize_t i, incr + Py_ssize_t i flatiter it bint has_labels - object res, name, labels, index + object res, name, labels object cached_typ = None arr = self.arr @@ -112,7 +110,6 @@ cdef class Reducer: labels = self.labels has_labels = labels is not None has_index = self.index is not None - incr = self.increment result = np.empty(self.nresults, dtype='O') it = PyArray_IterNew(result) @@ -193,10 +190,10 @@ cdef class _BaseGrouper: return values, index cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp, - Slider islider, Slider vslider, object name): + Slider islider, Slider vslider): if cached_typ is None: cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, name=name) + cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name) else: # See the comment in indexes/base.py about _index_data. 
# We need this for EA-backed indexes that have a reference @@ -205,7 +202,7 @@ cdef class _BaseGrouper: cached_ityp._engine.clear_mapping() object.__setattr__(cached_typ._data._block, 'values', vslider.buf) object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', name) + object.__setattr__(cached_typ, 'name', self.name) return cached_typ, cached_ityp @@ -254,7 +251,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): object res bint initialized = 0 Slider vslider, islider - object name, cached_typ = None, cached_ityp = None + object cached_typ = None, cached_ityp = None counts = np.zeros(self.ngroups, dtype=np.int64) @@ -268,7 +265,6 @@ cdef class SeriesBinGrouper(_BaseGrouper): group_size = 0 n = len(self.arr) - name = self.name vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) @@ -283,7 +279,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): vslider.set_length(group_size) cached_typ, cached_ityp = self._update_cached_objs( - cached_typ, cached_ityp, islider, vslider, name) + cached_typ, cached_ityp, islider, vslider) cached_ityp._engine.clear_mapping() res = self.f(cached_typ) @@ -356,13 +352,12 @@ cdef class SeriesGrouper(_BaseGrouper): object res bint initialized = 0 Slider vslider, islider - object name, cached_typ = None, cached_ityp = None + object cached_typ = None, cached_ityp = None labels = self.labels counts = np.zeros(self.ngroups, dtype=np.int64) group_size = 0 n = len(self.arr) - name = self.name vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) @@ -386,7 +381,7 @@ cdef class SeriesGrouper(_BaseGrouper): vslider.set_length(group_size) cached_typ, cached_ityp = self._update_cached_objs( - cached_typ, cached_ityp, islider, vslider, name) + cached_typ, cached_ityp, islider, vslider) cached_ityp._engine.clear_mapping() res = self.f(cached_typ) diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 8e4143a053ba3..8512b34b9e78c 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -201,7 +201,7 @@ cdef class _Timestamp(datetime): """ return np.datetime64(self.value, 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ Convert the Timestamp to a NumPy datetime64. @@ -369,18 +369,18 @@ cdef class _Timestamp(datetime): return out[0] @property - def _repr_base(self): + def _repr_base(self) -> str: return '{date} {time}'.format(date=self._date_repr, time=self._time_repr) @property - def _date_repr(self): + def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but # the datetime strftime() methods require year >= 1900 return '%d-%.2d-%.2d' % (self.year, self.month, self.day) @property - def _time_repr(self): + def _time_repr(self) -> str: result = '%.2d:%.2d:%.2d' % (self.hour, self.minute, self.second) if self.nanosecond != 0: @@ -391,7 +391,7 @@ cdef class _Timestamp(datetime): return result @property - def _short_repr(self): + def _short_repr(self) -> str: # format a Timestamp with only _date_repr if possible # otherwise _repr_base if (self.hour == 0 and @@ -403,7 +403,7 @@ cdef class _Timestamp(datetime): return self._repr_base @property - def asm8(self): + def asm8(self) -> np.datetime64: """ Return numpy datetime64 format in nanoseconds. 
""" diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index e491d6111a919..3ddce28fb6dd1 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -230,16 +230,16 @@ cdef class _NaT(datetime): return NotImplemented @property - def asm8(self): + def asm8(self) -> np.datetime64: return np.datetime64(NPY_NAT, 'ns') - def to_datetime64(self): + def to_datetime64(self) -> np.datetime64: """ Return a numpy.datetime64 object with 'ns' precision. """ return np.datetime64('NaT', 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ Convert the Timestamp to a NumPy datetime64. @@ -265,7 +265,7 @@ cdef class _NaT(datetime): def __str__(self) -> str: return 'NaT' - def isoformat(self, sep='T'): + def isoformat(self, sep='T') -> str: # This allows Timestamp(ts.isoformat()) to always correctly roundtrip. return 'NaT' diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 9d8ed62388655..21dbdfbb111ed 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -841,15 +841,15 @@ cdef class _Timedelta(timedelta): """ return timedelta(microseconds=int(self.value) / 1000) - def to_timedelta64(self): + def to_timedelta64(self) -> np.timedelta64: """ Return a numpy.timedelta64 object with 'ns' precision. """ return np.timedelta64(self.value, 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.timedelta64: """ - Convert the Timestamp to a NumPy timedelta64. + Convert the Timedelta to a NumPy timedelta64. .. versionadded:: 0.25.0 @@ -920,7 +920,7 @@ cdef class _Timedelta(timedelta): return self.value @property - def asm8(self): + def asm8(self) -> np.timedelta64: """ Return a numpy timedelta64 array scalar view. @@ -955,7 +955,7 @@ cdef class _Timedelta(timedelta): return np.int64(self.value).view('m8[ns]') @property - def resolution_string(self): + def resolution_string(self) -> str: """ Return a string representing the lowest timedelta resolution. 
@@ -1095,7 +1095,7 @@ cdef class _Timedelta(timedelta): self._ensure_components() return self._ns - def _repr_base(self, format=None): + def _repr_base(self, format=None) -> str: """ Parameters @@ -1148,10 +1148,10 @@ cdef class _Timedelta(timedelta): def __str__(self) -> str: return self._repr_base(format='long') - def __bool__(self): + def __bool__(self) -> bool: return self.value != 0 - def isoformat(self): + def isoformat(self) -> str: """ Format Timedelta as ISO 8601 Duration like ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 390fe60ea02b4..2d5576761c3a0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -830,8 +830,7 @@ def agg_series(self, obj: Series, func): assert self.ngroups != 0 if is_extension_array_dtype(obj.dtype): - # pre-empty SeriesBinGrouper from raising TypeError - # TODO: watch out, this can return None + # pre-empt SeriesBinGrouper from raising TypeError return self._aggregate_series_pure_python(obj, func) dummy = obj[:0] diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2674b7ee95088..76c4b328eb4db 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -565,8 +565,8 @@ def __init__( ): _left = _validate_operand(left) _right = _validate_operand(right) - self.left = self.orig_left = _validate_operand(_left) # type: "DataFrame" - self.right = self.orig_right = _validate_operand(_right) # type: "DataFrame" + self.left = self.orig_left = _left + self.right = self.orig_right = _right self.how = how self.axis = axis From db4095ec3f26c30892fe07989fa78b25ae4ca56b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 04:55:25 -0800 Subject: [PATCH 044/185] Check instead of try/except (#29587) --- pandas/core/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index e070005c56d7a..1c74c977e39bc 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -602,9 +602,9 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): if not len(results): raise ValueError("no results") - try: + if all(np.ndim(x) > 0 for x in results): return concat(results, keys=keys, axis=1, sort=False) - except TypeError: + else: # we are concatting non-NDFrame objects, # e.g. 
a list of scalars From d1173de59906f2394987a1384422d770901c717d Mon Sep 17 00:00:00 2001 From: Yash Shukla <29108833+yashukla@users.noreply.github.com> Date: Thu, 14 Nov 2019 06:56:27 -0600 Subject: [PATCH 045/185] CLN: F-string formatting in pandas/tests/indexes/*.py (#29547) (#29579) --- pandas/tests/indexes/test_base.py | 10 +++++----- pandas/tests/indexes/test_category.py | 4 ++-- pandas/tests/indexes/test_common.py | 7 ++++--- pandas/tests/indexes/test_numeric.py | 8 ++++---- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index fe9953341fdae..90f8fbc5faef2 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1832,7 +1832,7 @@ def test_drop_tuple(self, values, to_drop): tm.assert_index_equal(result, expected) removed = index.drop(to_drop[1]) - msg = r"\"\[{}\] not found in axis\"".format(re.escape(to_drop[1].__repr__())) + msg = fr"\"\[{re.escape(to_drop[1].__repr__())}\] not found in axis\"" for drop_me in to_drop[1], [to_drop[1]]: with pytest.raises(KeyError, match=msg): removed.drop(drop_me) @@ -2000,11 +2000,11 @@ def test_isin_level_kwarg_bad_label_raises(self, label, indices): index = indices if isinstance(index, MultiIndex): index = index.rename(["foo", "bar"]) - msg = "'Level {} not found'" + msg = f"'Level {label} not found'" else: index = index.rename("foo") - msg = r"Requested level \({}\) does not match index name \(foo\)" - with pytest.raises(KeyError, match=msg.format(label)): + msg = fr"Requested level \({label}\) does not match index name \(foo\)" + with pytest.raises(KeyError, match=msg): index.isin([], level=label) @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) @@ -2755,7 +2755,7 @@ def test_generated_op_names(opname, indices): # pd.Index.__rsub__ does not exist; though the method does exist # for subclasses. 
see GH#19723 return - opname = "__{name}__".format(name=opname) + opname = f"__{opname}__" method = getattr(indices, opname) assert method.__name__ == opname diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 61d9d1d70c360..84f98a55376f7 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -188,8 +188,8 @@ def test_disallow_set_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) - msg = "cannot perform {} with this index type: CategoricalIndex" - with pytest.raises(TypeError, match=msg.format(op_name)): + msg = f"cannot perform {op_name} with this index type: CategoricalIndex" + with pytest.raises(TypeError, match=msg): func(idx) def test_method_delegation(self): diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index ae1a21e9b3980..558ba04b657a1 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -167,7 +167,7 @@ def test_dtype_str(self, indices): def test_hash_error(self, indices): index = indices with pytest.raises( - TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) + TypeError, match=(f"unhashable type: {type(index).__name__!r}") ): hash(indices) @@ -201,8 +201,9 @@ def test_unique(self, indices): with pytest.raises(IndexError, match=msg): indices.unique(level=3) - msg = r"Requested level \(wrong\) does not match index name \({}\)".format( - re.escape(indices.name.__repr__()) + msg = ( + fr"Requested level \(wrong\) does not match index name " + fr"\({re.escape(indices.name.__repr__())}\)" ) with pytest.raises(KeyError, match=msg): indices.unique(level="wrong") diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index e424b3601a4b2..deb63cc9ef854 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -245,9 +245,9 @@ def test_astype(self, mixed_index, float_index): # invalid for dtype in ["M8[ns]", "m8[ns]"]: msg = ( - "Cannot convert Float64Index to dtype {}; integer values" - " are required for conversion" - ).format(pandas_dtype(dtype)) + f"Cannot convert Float64Index to dtype {pandas_dtype(dtype)}; " + f"integer values are required for conversion" + ) with pytest.raises(TypeError, match=re.escape(msg)): i.astype(dtype) @@ -588,7 +588,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) name = self._holder.__name__ - msg = "Unable to fill values because {name} cannot contain NA".format(name=name) + msg = f"Unable to fill values because {name} cannot contain NA" # fill_value=True with pytest.raises(ValueError, match=msg): From 2deb2da98fe831408f9ffba555784108f0496acd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 05:00:16 -0800 Subject: [PATCH 046/185] CLN: annotations and docstrings in pd.io (#29605) --- pandas/io/clipboard/__init__.py | 4 +- pandas/io/date_converters.py | 4 +- pandas/io/excel/_odfreader.py | 2 +- pandas/io/excel/_openpyxl.py | 32 ++++-- pandas/io/excel/_xlwt.py | 5 +- pandas/io/feather_format.py | 2 +- pandas/io/formats/csvs.py | 2 +- pandas/io/formats/excel.py | 8 +- pandas/io/formats/latex.py | 23 ++-- pandas/io/html.py | 15 +-- pandas/io/json/_json.py | 4 +- pandas/io/pytables.py | 189 ++++++++++++++++++-------------- pandas/io/sas/sas_xport.py | 8 +- 13 files changed, 172 insertions(+), 126 deletions(-) diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 
63dd40a229dfc..40452b41998df 100644
--- a/pandas/io/clipboard/__init__.py
+++ b/pandas/io/clipboard/__init__.py
@@ -95,8 +95,8 @@ def _stringifyText(text):
     acceptedTypes = (str, int, float, bool)
     if not isinstance(text, acceptedTypes):
         raise PyperclipException(
-            "only str, int, float, and bool values"
-            "can be copied to the clipboard, not".format(text.__class__.__name__)
+            "only str, int, float, and bool values "
+            f"can be copied to the clipboard, not {text.__class__.__name__}"
         )
     return str(text)
 
diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py
index ab64bc14344f1..7fdca2d65b05d 100644
--- a/pandas/io/date_converters.py
+++ b/pandas/io/date_converters.py
@@ -57,8 +57,8 @@ def _check_columns(cols):
     for i, n in enumerate(map(len, tail)):
         if n != N:
             raise AssertionError(
-                "All columns must have the same length: {0}; "
-                "column {1} has length {2}".format(N, i, n)
+                f"All columns must have the same length: {N}; "
+                f"column {i} has length {n}"
             )
 
     return N
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
index 3a67f8306fff1..97556f9685001 100644
--- a/pandas/io/excel/_odfreader.py
+++ b/pandas/io/excel/_odfreader.py
@@ -61,7 +61,7 @@ def get_sheet_by_name(self, name: str):
             if table.getAttribute("name") == name:
                 return table
 
-        raise ValueError("sheet {} not found".format(name))
+        raise ValueError(f"sheet {name} not found")
 
     def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
         """Parse an ODF Table into a list of lists
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index d8f5da5ab5bc6..d0d6096a4425e 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -46,7 +46,8 @@ def save(self):
     @classmethod
     def _convert_to_style(cls, style_dict):
         """
-        converts a style_dict to an openpyxl style object
+        Converts a style_dict to an openpyxl style object.
+
         Parameters
         ----------
         style_dict : style dictionary to convert
@@ -72,7 +73,8 @@ def _convert_to_style(cls, style_dict):
     def _convert_to_style_kwargs(cls, style_dict):
         """
         Convert a style_dict to a set of kwargs suitable for initializing
-        or updating-on-copy an openpyxl v2 style object
+        or updating-on-copy an openpyxl v2 style object.
+
         Parameters
         ----------
         style_dict : dict
@@ -83,6 +85,7 @@ def _convert_to_style_kwargs(cls, style_dict):
             'alignment'
             'number_format'
             'protection'
+
         Returns
         -------
         style_kwargs : dict
@@ -107,7 +110,8 @@ def _convert_to_style_kwargs(cls, style_dict):
     @classmethod
     def _convert_to_color(cls, color_spec):
         """
-        Convert ``color_spec`` to an openpyxl v2 Color object
+        Convert ``color_spec`` to an openpyxl v2 Color object.
+
         Parameters
         ----------
         color_spec : str, dict
@@ -120,6 +124,7 @@ def _convert_to_color(cls, color_spec):
             'tint'
             'index'
             'type'
+
         Returns
        -------
         color : openpyxl.styles.Color
@@ -135,7 +140,8 @@ def _convert_to_color(cls, color_spec):
     @classmethod
     def _convert_to_font(cls, font_dict):
         """
-        Convert ``font_dict`` to an openpyxl v2 Font object
+        Convert ``font_dict`` to an openpyxl v2 Font object.
+
         Parameters
         ----------
         font_dict : dict
@@ -154,6 +160,7 @@ def _convert_to_font(cls, font_dict):
             'outline'
             'shadow'
             'condense'
+
         Returns
         -------
         font : openpyxl.styles.Font
@@ -185,11 +192,13 @@ def _convert_to_stop(cls, stop_seq):
         """
         Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
         suitable for initializing the ``GradientFill`` ``stop`` parameter.
+
         Parameters
         ----------
         stop_seq : iterable
             An iterable that yields objects suitable for consumption by
             ``_convert_to_color``.
+ Returns ------- stop : list of openpyxl.styles.Color @@ -200,7 +209,8 @@ def _convert_to_stop(cls, stop_seq): @classmethod def _convert_to_fill(cls, fill_dict): """ - Convert ``fill_dict`` to an openpyxl v2 Fill object + Convert ``fill_dict`` to an openpyxl v2 Fill object. + Parameters ---------- fill_dict : dict @@ -216,6 +226,7 @@ def _convert_to_fill(cls, fill_dict): 'top' 'bottom' 'stop' + Returns ------- fill : openpyxl.styles.Fill @@ -262,7 +273,8 @@ def _convert_to_fill(cls, fill_dict): @classmethod def _convert_to_side(cls, side_spec): """ - Convert ``side_spec`` to an openpyxl v2 Side object + Convert ``side_spec`` to an openpyxl v2 Side object. + Parameters ---------- side_spec : str, dict @@ -270,6 +282,7 @@ def _convert_to_side(cls, side_spec): of the following keys (or their synonyms). 'style' ('border_style') 'color' + Returns ------- side : openpyxl.styles.Side @@ -295,7 +308,8 @@ def _convert_to_side(cls, side_spec): @classmethod def _convert_to_border(cls, border_dict): """ - Convert ``border_dict`` to an openpyxl v2 Border object + Convert ``border_dict`` to an openpyxl v2 Border object. + Parameters ---------- border_dict : dict @@ -311,6 +325,7 @@ def _convert_to_border(cls, border_dict): 'diagonalUp' ('diagonalup') 'diagonalDown' ('diagonaldown') 'outline' + Returns ------- border : openpyxl.styles.Border @@ -335,7 +350,8 @@ def _convert_to_border(cls, border_dict): @classmethod def _convert_to_alignment(cls, alignment_dict): """ - Convert ``alignment_dict`` to an openpyxl v2 Alignment object + Convert ``alignment_dict`` to an openpyxl v2 Alignment object. + Parameters ---------- alignment_dict : dict diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index fe3d0a208de6a..996ae1caa14c8 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -77,7 +77,9 @@ def write_cells( wks.write(startrow + cell.row, startcol + cell.col, val, style) @classmethod - def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): + def _style_to_xlwt( + cls, item, firstlevel: bool = True, field_sep=",", line_sep=";" + ) -> str: """helper which recursively generate an xlwt easy style string for example: @@ -117,6 +119,7 @@ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): def _convert_to_style(cls, style_dict, num_format_str=None): """ converts a style_dict to an xlwt style object + Parameters ---------- style_dict : style dictionary to convert diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index dd6519275ad15..d9e88f42c2ef2 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -10,7 +10,7 @@ from pandas.io.common import _stringify_path -def to_feather(df, path): +def to_feather(df: DataFrame, path): """ Write a DataFrame to the feather-format diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e25862537cbfc..f0493036b934a 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -327,7 +327,7 @@ def _save(self): self._save_chunk(start_i, end_i) - def _save_chunk(self, start_i, end_i): + def _save_chunk(self, start_i: int, end_i: int): data_index = self.data_index diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b9c847ad64c57..cd0889044094f 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -63,8 +63,9 @@ def __init__(self, inherited=None): compute_css = CSSResolver() - def __call__(self, declarations_str): - """Convert CSS declarations to ExcelWriter style + def __call__(self, 
declarations_str: str):
+        """
+        Convert CSS declarations to ExcelWriter style.
 
         Parameters
         ----------
@@ -279,6 +280,7 @@ def build_font(self, props):
                 if "text-shadow" in props
                 else None
             ),
+            # FIXME: don't leave commented-out
             # 'vertAlign':,
             # 'charset': ,
             # 'scheme': ,
@@ -665,7 +667,7 @@ def _format_hierarchical_rows(self):
                 for cell in self._generate_body(gcolidx):
                     yield cell
 
-    def _generate_body(self, coloffset):
+    def _generate_body(self, coloffset: int):
         if self.styler is None:
             styles = None
         else:
diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py
index ca9db88ae7be4..6f903e770c86c 100644
--- a/pandas/io/formats/latex.py
+++ b/pandas/io/formats/latex.py
@@ -11,8 +11,8 @@
 
 
 class LatexFormatter(TableFormatter):
-    """ Used to render a DataFrame to a LaTeX tabular/longtable environment
-    output.
+    """
+    Used to render a DataFrame to a LaTeX tabular/longtable environment output.
 
     Parameters
     ----------
@@ -106,18 +106,19 @@ def pad_empties(x):
         # Get rid of old multiindex column and add new ones
         strcols = out + strcols[1:]
 
-        column_format = self.column_format
-        if column_format is None:
+        if self.column_format is None:
             dtypes = self.frame.dtypes._values
             column_format = "".join(map(get_col_type, dtypes))
             if self.fmt.index:
                 index_format = "l" * self.frame.index.nlevels
                 column_format = index_format + column_format
-        elif not isinstance(column_format, str):  # pragma: no cover
+        elif not isinstance(self.column_format, str):  # pragma: no cover
             raise AssertionError(
                 "column_format must be str or unicode, "
-                "not {typ}".format(typ=type(column_format))
+                "not {typ}".format(typ=type(self.column_format))
             )
+        else:
+            column_format = self.column_format
 
         if self.longtable:
             self._write_longtable_begin(buf, column_format)
@@ -265,7 +266,7 @@ def _format_multirow(
 
     def _print_cline(self, buf: IO[str], i: int, icol: int) -> None:
         """
-        Print clines after multirow-blocks are finished
+        Print clines after multirow-blocks are finished.
         """
         for cl in self.clinebuf:
             if cl[0] == i:
@@ -273,7 +274,7 @@ def _print_cline(self, buf: IO[str], i: int, icol: int) -> None:
         # remove entries that have been written to buffer
         self.clinebuf = [x for x in self.clinebuf if x[0] != i]
 
-    def _write_tabular_begin(self, buf, column_format):
+    def _write_tabular_begin(self, buf, column_format: str):
         """
         Write the beginning of a tabular environment or
         nested table/tabular environments including caption and label.
@@ -283,11 +284,10 @@ def _write_tabular_begin(self, buf, column_format):
         buf : string or file handle
             File path or object. If not specified, the result is returned as
             a string.
- column_format : str, default None + column_format : str The columns format as specified in `LaTeX table format `__ e.g 'rcl' for 3 columns - """ buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) diff --git a/pandas/io/html.py b/pandas/io/html.py index 9a368907b65aa..ed2b21994fdca 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -57,7 +57,7 @@ def _importers(): _RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") -def _remove_whitespace(s, regex=_RE_WHITESPACE): +def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str: """ Replace extra whitespace inside of a string with a single space. @@ -65,8 +65,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): ---------- s : str or unicode The string from which to remove extra whitespace. - - regex : regex + regex : re.Pattern The regular expression to use to remove extra whitespace. Returns @@ -253,7 +252,8 @@ def _text_getter(self, obj): raise AbstractMethodError(self) def _parse_td(self, obj): - """Return the td elements from a row element. + """ + Return the td elements from a row element. Parameters ---------- @@ -600,7 +600,7 @@ def _build_doc(self): ) -def _build_xpath_expr(attrs): +def _build_xpath_expr(attrs) -> str: """Build an xpath expression to simulate bs4's ability to pass in kwargs to search for attributes when using the lxml parser. @@ -810,7 +810,8 @@ def _data_to_frame(**kwargs): def _parser_dispatch(flavor): - """Choose the parser based on the input flavor. + """ + Choose the parser based on the input flavor. Parameters ---------- @@ -850,7 +851,7 @@ def _parser_dispatch(flavor): return _valid_parsers[flavor] -def _print_as_set(s): +def _print_as_set(s) -> str: return "{" + "{arg}".format(arg=", ".join(pprint_thing(el) for el in s)) + "}" diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 0a8f275cf54a9..26a3248262f9a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -711,7 +711,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): return data - def _combine_lines(self, lines): + def _combine_lines(self, lines) -> str: """ Combines a list of JSON objects into one JSON object. """ @@ -1169,7 +1169,7 @@ def _try_convert_dates(self): convert_dates = [] convert_dates = set(convert_dates) - def is_ok(col): + def is_ok(col) -> bool: """ Return if this col is ok to try for a date parse. """ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ee08e2abb2289..03eb8570e436e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -92,7 +92,7 @@ def _ensure_str(name): Term = Expr -def _ensure_term(where, scope_level): +def _ensure_term(where, scope_level: int): """ ensure that the where is a Term or a list of Term this makes sure that we are capturing the scope of variables @@ -252,7 +252,7 @@ def to_hdf( complevel=None, complib=None, append=None, - **kwargs + **kwargs, ): """ store this object, close it if we opened it """ @@ -271,7 +271,7 @@ def to_hdf( f(path_or_buf) -def read_hdf(path_or_buf, key=None, mode="r", **kwargs): +def read_hdf(path_or_buf, key=None, mode: str = "r", **kwargs): """ Read from the store, close it if we opened it. @@ -340,8 +340,8 @@ def read_hdf(path_or_buf, key=None, mode="r", **kwargs): if mode not in ["r", "r+", "a"]: raise ValueError( - "mode {0} is not allowed while performing a read. " - "Allowed modes are r, r+ and a.".format(mode) + f"mode {mode} is not allowed while performing a read. " + f"Allowed modes are r, r+ and a." 
) # grab the scope if "where" in kwargs: @@ -406,7 +406,7 @@ def read_hdf(path_or_buf, key=None, mode="r", **kwargs): raise -def _is_metadata_of(group, parent_group): +def _is_metadata_of(group, parent_group) -> bool: """Check if a given group is a metadata group for a given parent_group.""" if group._v_depth <= parent_group._v_depth: return False @@ -466,7 +466,13 @@ class HDFStore: """ def __init__( - self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs + self, + path, + mode=None, + complevel=None, + complib=None, + fletcher32: bool = False, + **kwargs, ): if "format" in kwargs: @@ -577,7 +583,7 @@ def items(self): iteritems = items - def open(self, mode="a", **kwargs): + def open(self, mode: str = "a", **kwargs): """ Open the file in the specified mode @@ -615,19 +621,19 @@ def open(self, mode="a", **kwargs): try: self._handle = tables.open_file(self._path, self._mode, **kwargs) - except (IOError) as e: # pragma: no cover - if "can not be written" in str(e): + except IOError as err: # pragma: no cover + if "can not be written" in str(err): print("Opening {path} in read-only mode".format(path=self._path)) self._handle = tables.open_file(self._path, "r", **kwargs) else: raise - except (ValueError) as e: + except ValueError as err: # trap PyTables >= 3.1 FILE_OPEN_POLICY exception # to provide an updated message - if "FILE_OPEN_POLICY" in str(e): - e = ValueError( + if "FILE_OPEN_POLICY" in str(err): + err = ValueError( "PyTables [{version}] no longer supports opening multiple " "files\n" "even in read-only mode on this HDF5 version " @@ -641,14 +647,14 @@ def open(self, mode="a", **kwargs): ) ) - raise e + raise err - except (Exception) as e: + except Exception as err: # trying to read from a non-existent file causes an error which # is not part of IOError, make it one - if self._mode == "r" and "Unable to open/create file" in str(e): - raise IOError(str(e)) + if self._mode == "r" and "Unable to open/create file" in str(err): + raise IOError(str(err)) raise def close(self): @@ -660,7 +666,7 @@ def close(self): self._handle = None @property - def is_open(self): + def is_open(self) -> bool: """ return a boolean indicating whether the file is open """ @@ -668,7 +674,7 @@ def is_open(self): return False return bool(self._handle.isopen) - def flush(self, fsync=False): + def flush(self, fsync: bool = False): """ Force all buffered modifications to be written to disk. @@ -719,8 +725,8 @@ def select( columns=None, iterator=False, chunksize=None, - auto_close=False, - **kwargs + auto_close: bool = False, + **kwargs, ): """ Retrieve pandas object stored in file, optionally based on where criteria. @@ -824,10 +830,11 @@ def select_as_multiple( stop=None, iterator=False, chunksize=None, - auto_close=False, - **kwargs + auto_close: bool = False, + **kwargs, ): - """ Retrieve pandas objects from multiple tables + """ + Retrieve pandas objects from multiple tables. Parameters ---------- @@ -839,6 +846,8 @@ def select_as_multiple( stop : integer (defaults to None), row number to stop selection iterator : boolean, return an iterator, default False chunksize : nrows to include in iteration, return an iterator + auto_close : bool, default False + Should automatically close the store when finished. 
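A hedged usage sketch of this method (the key names and file path are invented, and PyTables must be installed); when ``selector`` is omitted, the first key is used to evaluate ``where``::

    import pandas as pd

    df1 = pd.DataFrame({"a": range(5), "b": range(5)})
    df2 = pd.DataFrame({"c": range(5)})

    with pd.HDFStore("demo.h5", mode="w") as store:
        store.append("df1", df1)
        store.append("df2", df2)
        # rows matching the criteria are gathered from every table in keys
        result = store.select_as_multiple(["df1", "df2"], where="index < 3")

    assert list(result.columns) == ["a", "b", "c"]
    assert len(result) == 3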
Raises ------ @@ -860,7 +869,7 @@ def select_as_multiple( stop=stop, iterator=iterator, chunksize=chunksize, - **kwargs + **kwargs, ) if not isinstance(keys, (list, tuple)): @@ -1262,27 +1271,28 @@ def copy( self, file, mode="w", - propindexes=True, + propindexes: bool = True, keys=None, complib=None, complevel=None, - fletcher32=False, + fletcher32: bool = False, overwrite=True, ): - """ copy the existing store to a new file, upgrading in place - - Parameters - ---------- - propindexes: restore indexes in copied file (defaults to True) - keys : list of keys to include in the copy (defaults to all) - overwrite : overwrite (remove and replace) existing nodes in the - new store (default is True) - mode, complib, complevel, fletcher32 same as in HDFStore.__init__ + """ + Copy the existing store to a new file, updating in place. - Returns - ------- - open file handle of the new store + Parameters + ---------- + propindexes: bool, default True + Restore indexes in copied file. + keys : list of keys to include in the copy (defaults to all) + overwrite : overwrite (remove and replace) existing nodes in the + new store (default is True) + mode, complib, complevel, fletcher32 same as in HDFStore.__init__ + Returns + ------- + open file handle of the new store """ new_store = HDFStore( file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 @@ -1302,7 +1312,7 @@ def copy( data = self.select(k) if s.is_table: - index = False + index = False # type: Union[bool, list] if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( @@ -1317,7 +1327,7 @@ def copy( return new_store - def info(self): + def info(self) -> str: """ Print detailed information on the store. @@ -1478,7 +1488,7 @@ def _write_to_group( append=False, complib=None, encoding=None, - **kwargs + **kwargs, ): group = self.get_node(key) @@ -1550,13 +1560,16 @@ class TableIterator: nrows : the rows to iterate on start : the passed start value (default is None) stop : the passed stop value (default is None) - iterator : boolean, whether to use the default iterator + iterator : bool, default False + Whether to use the default iterator. 
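These two arguments are what turn a plain ``select`` into iteration; a small sketch (file name arbitrary, PyTables assumed installed)::

    import pandas as pd

    df = pd.DataFrame({"a": range(10)})
    with pd.HDFStore("chunks.h5", mode="w") as store:
        store.append("df", df)
        # chunksize makes select() return a TableIterator of DataFrames
        pieces = list(store.select("df", chunksize=4))

    assert [len(p) for p in pieces] == [4, 4, 2]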
chunksize : the passed chunking value (default is 50000) auto_close : boolean, automatically close the store at the end of iteration, default is False kwargs : the passed kwargs """ + chunksize: Optional[int] + def __init__( self, store, @@ -1566,7 +1579,7 @@ def __init__( nrows, start=None, stop=None, - iterator=False, + iterator: bool = False, chunksize=None, auto_close=False, ): @@ -1619,7 +1632,7 @@ def close(self): if self.auto_close: self.store.close() - def get_result(self, coordinates=False): + def get_result(self, coordinates: bool = False): # return the actual iterator if self.chunksize is not None: @@ -1676,7 +1689,7 @@ def __init__( freq=None, tz=None, index_name=None, - **kwargs + **kwargs, ): self.values = values self.kind = kind @@ -1708,13 +1721,13 @@ def set_name(self, name, kind_attr=None): return self - def set_axis(self, axis): + def set_axis(self, axis: int): """ set the axis over which I index """ self.axis = axis return self - def set_pos(self, pos): + def set_pos(self, pos: int): """ set the position of this column in the Table """ self.pos = pos if pos is not None and self.typ is not None: @@ -1736,23 +1749,23 @@ def __repr__(self) -> str: ) ) - def __eq__(self, other): + def __eq__(self, other) -> bool: """ compare 2 col items """ return all( getattr(self, a, None) == getattr(other, a, None) for a in ["name", "cname", "axis", "pos"] ) - def __ne__(self, other): + def __ne__(self, other) -> bool: return not self.__eq__(other) @property - def is_indexed(self): + def is_indexed(self) -> bool: """ return whether I am an indexed column """ try: return getattr(self.table.cols, self.cname).is_indexed except AttributeError: - False + return False def copy(self): new_self = copy.copy(self) @@ -1767,7 +1780,9 @@ def infer(self, handler): new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): + def convert( + self, values: np.ndarray, nan_rep, encoding, errors, start=None, stop=None + ): """ set the values from this selection: take = take ownership """ # values is a recarray @@ -1961,7 +1976,7 @@ class GenericIndexCol(IndexCol): """ an index which is not represented in the data of the table """ @property - def is_indexed(self): + def is_indexed(self) -> bool: return False def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): @@ -2042,7 +2057,7 @@ def __init__( meta=None, metadata=None, block=None, - **kwargs + **kwargs, ): super().__init__(values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None @@ -2497,7 +2512,7 @@ def __init__(self, parent, group, encoding=None, errors="strict", **kwargs): self.set_version() @property - def is_old_version(self): + def is_old_version(self) -> bool: return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 def set_version(self): @@ -2515,7 +2530,7 @@ def pandas_type(self): return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) @property - def format_type(self): + def format_type(self) -> str: return "fixed" def __repr__(self) -> str: @@ -2590,7 +2605,7 @@ def storable(self): return self.group @property - def is_exists(self): + def is_exists(self) -> bool: return False @property @@ -2647,7 +2662,7 @@ class GenericFixed(Fixed): attributes = [] # type: List[str] # indexer helpders - def _class_to_alias(self, cls): + def _class_to_alias(self, cls) -> str: return self._index_type_map.get(cls, "") def _alias_to_class(self, alias): @@ -2700,7 +2715,7 @@ def validate_read(self, kwargs): return 
kwargs @property - def is_exists(self): + def is_exists(self) -> bool: return True def set_attrs(self): @@ -2908,14 +2923,14 @@ def read_index_node(self, node, start=None, stop=None): data, kind, encoding=self.encoding, errors=self.errors ), dtype=object, - **kwargs + **kwargs, ) else: index = factory( _unconvert_index( data, kind, encoding=self.encoding, errors=self.errors ), - **kwargs + **kwargs, ) index.name = name @@ -2931,7 +2946,7 @@ def write_array_empty(self, key, value): getattr(self.group, key)._v_attrs.value_type = str(value.dtype) getattr(self.group, key)._v_attrs.shape = value.shape - def _is_empty_array(self, shape): + def _is_empty_array(self, shape) -> bool: """Returns true if any axis is zero length.""" return any(x == 0 for x in shape) @@ -3210,7 +3225,7 @@ def table_type_short(self): return self.table_type.split("_")[0] @property - def format_type(self): + def format_type(self) -> str: return "table" def __repr__(self) -> str: @@ -3309,7 +3324,7 @@ def nrows_expected(self): return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @property - def is_exists(self): + def is_exists(self) -> bool: """ has this table been created """ return "table" in self.group @@ -3335,12 +3350,12 @@ def axes(self): return itertools.chain(self.index_axes, self.values_axes) @property - def ncols(self): + def ncols(self) -> int: """ the number of total columns in the values axes """ return sum(len(a.values) for a in self.values_axes) @property - def is_transposed(self): + def is_transposed(self) -> bool: return False @property @@ -3378,7 +3393,7 @@ def values_cols(self): """ return a list of my values cols """ return [i.cname for i in self.values_axes] - def _get_metadata_path(self, key): + def _get_metadata_path(self, key) -> str: """ return the metadata pathname for this key """ return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, key=key) @@ -3572,9 +3587,19 @@ def create_index(self, columns=None, optlevel=None, kind=None): ) v.create_index(**kw) - def read_axes(self, where, **kwargs): - """create and return the axes sniffed from the table: return boolean - for success + def read_axes(self, where, **kwargs) -> bool: + """ + Create the axes sniffed from the table. + + Parameters + ---------- + where : ??? + **kwargs + + Returns + ------- + bool + Indicates success. 
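The ``fixed``/``table`` split above is visible from the outside; a quick sketch (file name arbitrary) of how each storer reports itself through the ``format_type`` property annotated in this patch::

    import pandas as pd

    df = pd.DataFrame({"a": range(3)})
    with pd.HDFStore("fmt.h5", mode="w") as store:
        store.put("k_fixed", df)                  # Fixed storer
        store.put("k_table", df, format="table")  # Table storer
        assert store.get_storer("k_fixed").format_type == "fixed"
        assert store.get_storer("k_table").format_type == "table"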
""" # validate the version @@ -3654,7 +3679,7 @@ def create_axes( nan_rep=None, data_columns=None, min_itemsize=None, - **kwargs + **kwargs, ): """ create and return the axes legacy tables create an indexable column, indexable index, @@ -3941,7 +3966,7 @@ def process_filter(field, filt): return obj def create_description( - self, complib=None, complevel=None, fletcher32=False, expectedrows=None + self, complib=None, complevel=None, fletcher32: bool = False, expectedrows=None ): """ create the description of the table from the axes & values """ @@ -4104,7 +4129,7 @@ def write( chunksize=None, expectedrows=None, dropna=False, - **kwargs + **kwargs, ): if not append and self.is_exists: @@ -4340,7 +4365,7 @@ class AppendableFrameTable(AppendableTable): obj_type = DataFrame # type: Type[Union[DataFrame, Series]] @property - def is_transposed(self): + def is_transposed(self) -> bool: return self.index_axes[0].axis == 1 def get_object(self, obj): @@ -4411,7 +4436,7 @@ class AppendableSeriesTable(AppendableFrameTable): storage_obj_type = DataFrame @property - def is_transposed(self): + def is_transposed(self) -> bool: return False def get_object(self, obj): @@ -4547,7 +4572,7 @@ def read(self, **kwargs): return df -def _reindex_axis(obj, axis, labels, other=None): +def _reindex_axis(obj, axis: int, labels: Index, other=None): ax = obj._get_axis(axis) labels = ensure_index(labels) @@ -4562,7 +4587,7 @@ def _reindex_axis(obj, axis, labels, other=None): if other is not None: labels = ensure_index(other.unique()).intersection(labels, sort=False) if not labels.equals(ax): - slicer = [slice(None, None)] * obj.ndim + slicer = [slice(None, None)] * obj.ndim # type: List[Union[slice, Index]] slicer[axis] = labels obj = obj.loc[tuple(slicer)] return obj @@ -4588,7 +4613,7 @@ def _get_tz(tz): return zone -def _set_tz(values, tz, preserve_UTC=False, coerce=False): +def _set_tz(values, tz, preserve_UTC: bool = False, coerce: bool = False): """ coerce the values to a DatetimeIndex if tz is set preserve the input shape if possible @@ -4597,7 +4622,7 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): ---------- values : ndarray tz : string/pickled tz object - preserve_UTC : boolean, + preserve_UTC : bool, preserve the UTC of the result coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray """ @@ -4842,7 +4867,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): return data.reshape(shape) -def _maybe_convert(values, val_kind, encoding, errors): +def _maybe_convert(values: np.ndarray, val_kind, encoding, errors): if _need_convert(val_kind): conv = _get_converter(val_kind, encoding, errors) # conv = np.frompyfunc(conv, 1, 1) @@ -4862,7 +4887,7 @@ def _get_converter(kind, encoding, errors): raise ValueError("invalid kind {kind}".format(kind=kind)) -def _need_convert(kind): +def _need_convert(kind) -> bool: kind = _ensure_decoded(kind) if kind in ("datetime", "datetime64", "string"): return True diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index ea26a9b8efdbf..2f2dbdbc76215 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -143,7 +143,7 @@ """ -def _parse_date(datestr): +def _parse_date(datestr: str) -> datetime: """ Given a date in xport format, return Python date. """ try: # e.g. 
"16FEB11:10:07:55" @@ -152,11 +152,11 @@ def _parse_date(datestr): return pd.NaT -def _split_line(s, parts): +def _split_line(s: str, parts): """ Parameters ---------- - s: string + s: str Fixed-length string to split parts: list of (name, length) pairs Used to break up string, name '_' will be filtered from output. @@ -402,7 +402,7 @@ def _read_header(self): def __next__(self): return self.read(nrows=self._chunksize or 1) - def _record_count(self): + def _record_count(self) -> int: """ Get number of records in file. From bceac8eb717729bf7771b40c572a5dc4d3674f23 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 05:26:30 -0800 Subject: [PATCH 047/185] CLN: no longer need to catch AttributeError, IndexError (#29591) --- pandas/_libs/reduction.pyx | 1 + pandas/core/groupby/generic.py | 5 +---- pandas/core/groupby/ops.py | 5 +++++ pandas/core/resample.py | 1 + 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index eadb8003beba3..3df26cbcf214a 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -221,6 +221,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): def __init__(self, object series, object f, object bins, object dummy): assert dummy is not None # always obj[:0] + assert len(bins) > 0 # otherwise we get IndexError in get_result self.bins = bins self.f = f diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8f0b8a1e37af2..dec9a33c73f54 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -257,10 +257,7 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) - except (ValueError, KeyError, AttributeError, IndexError): - # TODO: IndexError can be removed here following GH#29106 - # TODO: AttributeError is caused by _index_data hijinx in - # libreduction, can be removed after GH#29160 + except (ValueError, KeyError): # TODO: KeyError is raised in _python_agg_general, # see see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2d5576761c3a0..754d67d329538 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -721,6 +721,10 @@ def __init__( self.mutated = mutated self.indexer = indexer + # These lengths must match, otherwise we could call agg_series + # with empty self.bins, which would raise in libreduction. 
+ assert len(self.binlabels) == len(self.bins) + @cache_readonly def groups(self): """ dict {group name -> group labels} """ @@ -828,6 +832,7 @@ def groupings(self) -> "List[grouper.Grouping]": def agg_series(self, obj: Series, func): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 + assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result if is_extension_array_dtype(obj.dtype): # pre-empt SeriesBinGrouper from raising TypeError diff --git a/pandas/core/resample.py b/pandas/core/resample.py index d980d5ba0be6e..79c808cb3951c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -187,6 +187,7 @@ def _get_binner(self): """ binner, bins, binlabels = self._get_binner_for_time() + assert len(bins) == len(binlabels) bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) return binner, bin_grouper From d691ec01cdd892920a4eabca458f72025a2e1b8b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 05:27:43 -0800 Subject: [PATCH 048/185] REF: make FrameApply less stateful, no self.results (#29585) --- pandas/core/apply.py | 51 +++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9c5806a3fe945..6302499b6d153 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,6 +1,6 @@ import abc import inspect -from typing import TYPE_CHECKING, Iterator, Type +from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Type, Union import numpy as np @@ -18,6 +18,8 @@ if TYPE_CHECKING: from pandas import DataFrame, Series, Index +ResType = Dict[int, Any] + def frame_apply( obj: "DataFrame", @@ -64,10 +66,15 @@ def result_index(self) -> "Index": def result_columns(self) -> "Index": pass + @property @abc.abstractmethod def series_generator(self) -> Iterator["Series"]: pass + @abc.abstractmethod + def wrap_results_for_axis(self, results: ResType) -> Union["Series", "DataFrame"]: + pass + # --------------------------------------------------------------- def __init__( @@ -107,8 +114,16 @@ def f(x): # results self.result = None - self.res_index = None - self.res_columns = None + self._res_index: Optional["Index"] = None + + @property + def res_index(self) -> "Index": + assert self._res_index is not None + return self._res_index + + @property + def res_columns(self) -> "Index": + return self.result_columns @property def columns(self) -> "Index": @@ -298,12 +313,12 @@ def apply_standard(self): return self.obj._constructor_sliced(result, index=labels) # compute the result using the series generator - self.apply_series_generator() + results = self.apply_series_generator() # wrap results - return self.wrap_results() + return self.wrap_results(results) - def apply_series_generator(self): + def apply_series_generator(self) -> ResType: series_gen = self.series_generator res_index = self.result_index @@ -330,17 +345,15 @@ def apply_series_generator(self): results[i] = self.f(v) keys.append(v.name) - self.results = results - self.res_index = res_index - self.res_columns = self.result_columns + self._res_index = res_index + return results - def wrap_results(self): - results = self.results + def wrap_results(self, results: ResType) -> Union["Series", "DataFrame"]: # see if we can infer the results if len(results) > 0 and 0 in results and is_sequence(results[0]): - return self.wrap_results_for_axis() + return self.wrap_results_for_axis(results) # dict of scalars result = self.obj._constructor_sliced(results) @@ -367,10 +380,9 
@@ def result_index(self) -> "Index": def result_columns(self) -> "Index": return self.index - def wrap_results_for_axis(self): + def wrap_results_for_axis(self, results: ResType) -> "DataFrame": """ return the results for the rows """ - results = self.results result = self.obj._constructor(data=results) if not isinstance(results[0], ABCSeries): @@ -406,13 +418,13 @@ def result_index(self) -> "Index": def result_columns(self) -> "Index": return self.columns - def wrap_results_for_axis(self): + def wrap_results_for_axis(self, results: ResType) -> Union["Series", "DataFrame"]: """ return the results for the columns """ - results = self.results + result: Union["Series", "DataFrame"] # we have requested to expand if self.result_type == "expand": - result = self.infer_to_same_shape() + result = self.infer_to_same_shape(results) # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): @@ -423,13 +435,12 @@ def wrap_results_for_axis(self): # we may want to infer results else: - result = self.infer_to_same_shape() + result = self.infer_to_same_shape(results) return result - def infer_to_same_shape(self) -> "DataFrame": + def infer_to_same_shape(self, results: ResType) -> "DataFrame": """ infer the results to the same shape as the input object """ - results = self.results result = self.obj._constructor(data=results) result = result.T From 505b6e7e4ca7aab0413298a018d7f46a0a53326c Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 14 Nov 2019 05:29:50 -0800 Subject: [PATCH 049/185] Fixed tokenizer build warnings (#29613) --- pandas/_libs/src/parser/tokenizer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 802b58d8ec916..4903e936dc348 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -162,9 +162,9 @@ typedef struct parser_t { int64_t skip_footer; // pick one, depending on whether the converter requires GIL double (*double_converter_nogil)(const char *, char **, - char, char, char, int); + char, char, char, int, int *, int *); double (*double_converter_withgil)(const char *, char **, - char, char, char, int); + char, char, char, int, int *, int *); // error handling char *warn_msg; From 65a4ee637e9d68e54b1b49418221f813750fc729 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Thu, 14 Nov 2019 16:28:52 +0200 Subject: [PATCH 050/185] CLN: f-string at pandas/_libs/tslib.pyx (#29593) --- pandas/_libs/tslib.pyx | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 01d90900cd604..598def4e1d9fa 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -266,20 +266,16 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, elif basic_format: dt64_to_dtstruct(val, &dts) - res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, - dts.month, - dts.day, - dts.hour, - dts.min, - dts.sec) + res = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' + f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}') if show_ns: ns = dts.ps // 1000 - res += '.%.9d' % (ns + 1000 * dts.us) + res += f'.{ns + dts.us * 1000:09d}' elif show_us: - res += '.%.6d' % dts.us + res += f'.{dts.us:06d}' elif show_ms: - res += '.%.3d' % (dts.us / 1000) + res += f'.{dts.us // 1000:03d}' result[i] = res From 8ad5c12de3c1497ffb593d46df8ff9a46dfe3ced Mon Sep 17 00:00:00 2001 From: ganevgv Date: Thu, 14 Nov 2019 16:47:48 
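The old ``%``-interpolation and the new f-strings render identically; a self-contained check of the three precision branches (component values invented for the demo)::

    year, month, day = 2019, 11, 14
    hour, minute, sec, us = 5, 26, 30, 123456

    base = f"{year}-{month:02d}-{day:02d} {hour:02d}:{minute:02d}:{sec:02d}"
    assert base == "%d-%.2d-%.2d %.2d:%.2d:%.2d" % (year, month, day, hour, minute, sec)
    assert base == "2019-11-14 05:26:30"

    assert f".{us:06d}" == ".123456"        # microseconds branch
    assert f".{us // 1000:03d}" == ".123"   # milliseconds branch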
+0200 Subject: [PATCH 051/185] TST: add test for .unique() dtype preserving (#29515) --- pandas/tests/test_algos.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ef844dd97120a..0bc09ddc40035 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -10,6 +10,13 @@ from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_complex_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype as CDT import pandas as pd @@ -23,6 +30,7 @@ Timestamp, compat, ) +from pandas.conftest import BYTES_DTYPES, STRING_DTYPES import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com @@ -352,6 +360,35 @@ def test_on_index_object(self): tm.assert_almost_equal(result, expected) + def test_dtype_preservation(self, any_numpy_dtype): + # GH 15442 + if any_numpy_dtype in (BYTES_DTYPES + STRING_DTYPES): + pytest.skip("skip string dtype") + elif is_integer_dtype(any_numpy_dtype): + data = [1, 2, 2] + uniques = [1, 2] + elif is_float_dtype(any_numpy_dtype): + data = [1, 2, 2] + uniques = [1.0, 2.0] + elif is_complex_dtype(any_numpy_dtype): + data = [complex(1, 0), complex(2, 0), complex(2, 0)] + uniques = [complex(1, 0), complex(2, 0)] + elif is_bool_dtype(any_numpy_dtype): + data = [True, True, False] + uniques = [True, False] + elif is_object_dtype(any_numpy_dtype): + data = ["A", "B", "B"] + uniques = ["A", "B"] + else: + # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere + data = [1, 2, 2] + uniques = [1, 2] + + result = Series(data, dtype=any_numpy_dtype).unique() + expected = np.array(uniques, dtype=any_numpy_dtype) + + tm.assert_numpy_array_equal(result, expected) + def test_datetime64_dtype_array_returned(self): # GH 9431 expected = np_array_datetime64_compat( From 6a5c860df0502e4c21948fc11f6e84a7199bfe75 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 14 Nov 2019 15:16:35 +0000 Subject: [PATCH 052/185] CLN: remove kwargs from Index._simple_new (#29604) --- pandas/core/indexes/base.py | 7 +++---- pandas/core/indexes/datetimes.py | 6 ++++++ pandas/core/indexes/range.py | 7 ++----- pandas/core/indexes/timedeltas.py | 6 ++++++ 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 68736935ed36d..20ca176f4a456 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -506,7 +506,7 @@ def asi8(self): return None @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): """ We require that we have a dtype compat for the values. If we are passed a non-dtype compat, then coerce using the constructor. @@ -528,8 +528,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): # we actually set this value too. 
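The behaviour the new test pins down, in two lines (``uint8`` chosen arbitrarily)::

    import numpy as np
    import pandas as pd

    result = pd.Series([1, 2, 2], dtype=np.uint8).unique()
    assert result.dtype == np.uint8      # dtype survives unique()
    assert result.tolist() == [1, 2]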
result._index_data = values result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) + return result._reset_identity() @cache_readonly @@ -2673,7 +2672,7 @@ def difference(self, other, sort=None): except TypeError: pass - return this._shallow_copy(the_diff, name=result_name, freq=None) + return this._shallow_copy(the_diff, name=result_name) def symmetric_difference(self, other, result_name=None, sort=None): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 4a3ee57084a8a..e19ebf17a1b98 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -464,6 +464,12 @@ def _convert_for_op(self, value): return _to_M8(value) raise ValueError("Passed item and index have different timezone") + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx.freq = None + return new_idx + # -------------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 962ba8cc00557..0fe0fe5a426ae 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -146,7 +146,7 @@ def from_range(cls, data, name=None, dtype=None): return cls._simple_new(data, dtype=dtype, name=name) @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): result = object.__new__(cls) # handle passed None, non-integers @@ -154,13 +154,10 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): # empty values = range(0, 0, 1) elif not isinstance(values, range): - return Index(values, dtype=dtype, name=name, **kwargs) + return Index(values, dtype=dtype, name=name) result._range = values - result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) result._reset_identity() return result diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 8114b4a772f28..6caac43af163b 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -406,6 +406,12 @@ def intersection(self, other, sort=False): """ return super().intersection(other, sort=sort) + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx.freq = None + return new_idx + def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) if ( From 4ef7313ae695ee51777e7de806b57c5ae2697a98 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 08:29:38 -0800 Subject: [PATCH 053/185] REF: de-nest _get_cython_function (#29609) --- pandas/core/groupby/generic.py | 4 +-- pandas/core/groupby/ops.py | 56 +++++++++++++--------------------- 2 files changed, 23 insertions(+), 37 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dec9a33c73f54..002d8640f109d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1058,14 +1058,14 @@ def _cython_agg_blocks( return new_items, new_blocks - def _aggregate_frame(self, func, *args, **kwargs): + def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") axis = self.axis obj = self._obj_with_exclusions - result = OrderedDict() + result = OrderedDict() # type: OrderedDict if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) diff 
--git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 754d67d329538..7ed79e4b00371 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -201,7 +201,7 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0): continue # group might be modified - group_axes = _get_axes(group) + group_axes = group.axes res = f(group) if not _is_indexed_like(res, group_axes): mutated = True @@ -358,40 +358,33 @@ def _is_builtin_func(self, arg): def _get_cython_function(self, kind: str, how: str, values, is_numeric: bool): dtype_str = values.dtype.name + ftype = self._cython_functions[kind][how] - def get_func(fname): - # see if there is a fused-type version of function - # only valid for numeric - f = getattr(libgroupby, fname, None) - if f is not None and is_numeric: - return f - - # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, "object"]: - f2 = getattr( - libgroupby, - "{fname}_{dtype_str}".format(fname=fname, dtype_str=dt), - None, - ) - if f2 is not None: - return f2 - - if hasattr(f, "__signatures__"): - # inspect what fused types are implemented - if dtype_str == "object" and "object" not in f.__signatures__: - # return None so we get a NotImplementedError below - # instead of a TypeError at runtime - return None + # see if there is a fused-type version of function + # only valid for numeric + f = getattr(libgroupby, ftype, None) + if f is not None and is_numeric: return f - ftype = self._cython_functions[kind][how] + # otherwise find dtype-specific version, falling back to object + for dt in [dtype_str, "object"]: + f2 = getattr(libgroupby, f"{ftype}_{dt}", None) + if f2 is not None: + return f2 + + if hasattr(f, "__signatures__"): + # inspect what fused types are implemented + if dtype_str == "object" and "object" not in f.__signatures__: + # disallow this function so we get a NotImplementedError below + # instead of a TypeError at runtime + f = None - func = get_func(ftype) + func = f if func is None: raise NotImplementedError( - "function is not implemented for this dtype: " - "[how->{how},dtype->{dtype_str}]".format(how=how, dtype_str=dtype_str) + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" ) return func @@ -843,13 +836,6 @@ def agg_series(self, obj: Series, func): return grouper.get_result() -def _get_axes(group): - if isinstance(group, Series): - return [group.index] - else: - return group.axes - - def _is_indexed_like(obj, axes) -> bool: if isinstance(obj, Series): if len(axes) > 1: From bb9daef857c5fe982e2fbb13c96501eb67b2d223 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 08:30:17 -0800 Subject: [PATCH 054/185] CLN: reachable cases in Reducer (#29610) --- pandas/_libs/reduction.pyx | 37 ++++++++---------------- pandas/tests/groupby/test_bin_groupby.py | 17 ++++++----- 2 files changed, 21 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 3df26cbcf214a..8733249888ae9 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -81,7 +81,7 @@ cdef class Reducer: else: - # we passed a series-like + # we passed a Series typ = type(dummy) index = dummy.index dummy = dummy.values @@ -99,7 +99,6 @@ cdef class Reducer: ndarray arr, result, chunk Py_ssize_t i flatiter it - bint has_labels object res, name, labels object cached_typ = None @@ -108,8 +107,6 @@ cdef class Reducer: dummy_buf = chunk.data chunk.data = arr.data labels = self.labels - has_labels = labels is not None - has_index = 
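The de-nested lookup is plain getattr-chaining; a standalone sketch against a stand-in namespace (``libfuncs`` is invented here, it is not the real ``libgroupby``, and the ``__signatures__`` wrinkle is omitted)::

    class libfuncs:
        @staticmethod
        def group_add(out, values):         # fused-type version, numeric only
            ...

        @staticmethod
        def group_min_object(out, values):  # dtype-specific fallback
            ...

    def get_cython_function(ftype: str, dtype_str: str, is_numeric: bool):
        # prefer the fused-type implementation for numeric dtypes
        f = getattr(libfuncs, ftype, None)
        if f is not None and is_numeric:
            return f
        # otherwise look for a dtype-specific variant, falling back to object
        for dt in [dtype_str, "object"]:
            f2 = getattr(libfuncs, f"{ftype}_{dt}", None)
            if f2 is not None:
                return f2
        raise NotImplementedError(f"[how->{ftype},dtype->{dtype_str}]")

    assert get_cython_function("group_add", "float64", True) is libfuncs.group_add
    assert get_cython_function("group_min", "object", False) is libfuncs.group_min_object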
self.index is not None result = np.empty(self.nresults, dtype='O') it = PyArray_IterNew(result) @@ -117,33 +114,19 @@ cdef class Reducer: try: for i in range(self.nresults): - if has_labels: - name = labels[i] - else: - name = None - # create the cached type # each time just reassign the data if i == 0: if self.typ is not None: - - # recreate with the index if supplied - if has_index: - - cached_typ = self.typ( - chunk, index=self.index, name=name) - - else: - - # use the passsed typ, sans index - cached_typ = self.typ(chunk, name=name) + # In this case, we also have self.index + name = labels[i] + cached_typ = self.typ(chunk, index=self.index, name=name) # use the cached_typ if possible if cached_typ is not None: - - if has_index: - object.__setattr__(cached_typ, 'index', self.index) + # In this case, we also have non-None labels + name = labels[i] object.__setattr__( cached_typ._data._block, 'values', chunk) @@ -607,18 +590,22 @@ cdef class BlockSlider: arr.shape[1] = 0 -def compute_reduction(arr, f, axis=0, dummy=None, labels=None): +def compute_reduction(arr: np.ndarray, f, axis: int = 0, dummy=None, labels=None): """ Parameters ----------- - arr : NDFrame object + arr : np.ndarray f : function axis : integer axis dummy : type of reduced output (series) labels : Index or None """ + # We either have both dummy and labels, or neither of them + if (labels is None) ^ (dummy is None): + raise ValueError("Must pass either dummy and labels, or neither") + if labels is not None: # Caller is responsible for ensuring we don't have MultiIndex assert labels.nlevels == 1 diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 4ede6b165c691..fcdf599e4ba33 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -116,15 +116,16 @@ class TestMoments: class TestReducer: def test_int_index(self): arr = np.random.randn(100, 4) - result = libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) - expected = arr.sum(0) - tm.assert_almost_equal(result, expected) - result = libreduction.compute_reduction( - arr, np.sum, axis=1, labels=Index(np.arange(100)) - ) - expected = arr.sum(1) - tm.assert_almost_equal(result, expected) + msg = "Must pass either dummy and labels, or neither" + # we must pass either both labels and dummy, or neither + with pytest.raises(ValueError, match=msg): + libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) + + with pytest.raises(ValueError, match=msg): + libreduction.compute_reduction( + arr, np.sum, axis=1, labels=Index(np.arange(100)) + ) dummy = Series(0.0, index=np.arange(100)) result = libreduction.compute_reduction( From 79e1fc3d1242901a3080be9bbe8c7a4f8769a748 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 08:31:54 -0800 Subject: [PATCH 055/185] REF: boilerplate for ops internal consistency (#28037) --- pandas/core/arrays/categorical.py | 9 ++- pandas/core/arrays/datetimelike.py | 24 +++----- pandas/core/arrays/datetimes.py | 13 +---- pandas/core/arrays/integer.py | 18 ++---- pandas/core/arrays/period.py | 8 +-- pandas/core/arrays/sparse/array.py | 19 ++----- pandas/core/arrays/timedeltas.py | 5 +- pandas/core/indexes/range.py | 11 ++-- pandas/core/ops/__init__.py | 14 ++--- pandas/core/ops/common.py | 66 ++++++++++++++++++++++ pandas/tests/arithmetic/test_datetime64.py | 1 + 11 files changed, 105 insertions(+), 83 deletions(-) create mode 100644 pandas/core/ops/common.py diff --git a/pandas/core/arrays/categorical.py 
b/pandas/core/arrays/categorical.py index 73d1db9bda8ed..817972b3356a2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -8,7 +8,7 @@ from pandas._config import get_option -from pandas._libs import algos as libalgos, hashtable as htable, lib +from pandas._libs import algos as libalgos, hashtable as htable from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -39,7 +39,7 @@ needs_i8_conversion, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -52,6 +52,7 @@ import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array from pandas.core.missing import interpolate_2d +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort from pandas.io.formats import console @@ -74,16 +75,14 @@ def _cat_compare_op(op): opname = "__{op}__".format(op=op.__name__) + @unpack_zerodim_and_defer(opname) def f(self, other): # On python2, you can usually compare any type to any type, and # Categoricals can be seen as a custom type, but having different # results depending whether categories are the same or not is kind of # insane, so be a bit stricter here and use the python3 idea of # comparing only things of equal type. - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - other = lib.item_from_zerodim(other) if is_list_like(other) and len(other) != len(self): # TODO: Could this fail if the categories are listlike objects? raise ValueError("Lengths must match.") diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 287ff9d618501..e52bc17fcc319 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -33,12 +33,7 @@ is_unsigned_integer_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCPeriodArray, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodArray, ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna @@ -46,6 +41,7 @@ from pandas.core import missing, nanops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import make_invalid_op from pandas.tseries import frequencies @@ -1194,13 +1190,11 @@ def _time_shift(self, periods, freq=None): # to be passed explicitly. 
return self._generate_range(start=start, end=end, periods=None, freq=self.freq) + @unpack_zerodim_and_defer("__add__") def __add__(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): - return NotImplemented # scalar others - elif other is NaT: + if other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) @@ -1248,13 +1242,11 @@ def __radd__(self, other): # alias for __add__ return self.__add__(other) + @unpack_zerodim_and_defer("__sub__") def __sub__(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): - return NotImplemented # scalar others - elif other is NaT: + if other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) @@ -1343,11 +1335,11 @@ def __rsub__(self, other): return -(self - other) # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 - def __iadd__(self, other): + def __iadd__(self, other): # type: ignore # alias for __add__ return self.__add__(other) - def __isub__(self, other): + def __isub__(self, other): # type: ignore # alias for __sub__ return self.__sub__(other) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7cd103d12fa8a..8e3c727a14c99 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -40,12 +40,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCPandasArray, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -53,6 +48,7 @@ from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import get_period_alias, to_offset @@ -157,11 +153,8 @@ def _dt_array_cmp(cls, op): opname = "__{name}__".format(name=op.__name__) nat_result = opname == "__ne__" + @unpack_zerodim_and_defer(opname) def wrapper(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - - other = lib.item_from_zerodim(other) if isinstance(other, (datetime, np.datetime64, str)): if isinstance(other, (datetime, np.datetime64)): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 41d8bffd8c131..e167e556b244a 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -21,12 +21,12 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops from pandas.core.algorithms import take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -602,13 +602,8 @@ def _values_for_argsort(self) -> np.ndarray: def _create_comparison_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): - - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # 
Rely on pandas to unbox and dispatch to us. - return NotImplemented - - other = lib.item_from_zerodim(other) mask = None if isinstance(other, IntegerArray): @@ -697,15 +692,14 @@ def _maybe_mask_result(self, result, mask, other, op_name): def _create_arithmetic_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op.__name__) def integer_arithmetic_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented - - other = lib.item_from_zerodim(other) mask = None + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + if isinstance(other, IntegerArray): other, mask = other._data, other._mask diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 78cc54db4b1b8..fdf4059fad569 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -4,7 +4,6 @@ import numpy as np -from pandas._libs import lib from pandas._libs.tslibs import ( NaT, NaTType, @@ -35,7 +34,6 @@ ) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, @@ -46,6 +44,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick @@ -69,13 +68,10 @@ def _period_array_cmp(cls, op): opname = "__{name}__".format(name=op.__name__) nat_result = opname == "__ne__" + @unpack_zerodim_and_defer(opname) def wrapper(self, other): ordinal_op = getattr(self.asi8, opname) - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - if is_list_like(other) and len(other) != len(self): raise ValueError("Lengths must match") diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 14024401ea110..943dea4252499 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -34,12 +34,7 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCSeries, - ABCSparseArray, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries, ABCSparseArray from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.algorithms as algos @@ -49,6 +44,7 @@ from pandas.core.construction import sanitize_array from pandas.core.missing import interpolate_2d import pandas.core.ops as ops +from pandas.core.ops.common import unpack_zerodim_and_defer import pandas.io.formats.printing as printing @@ -1410,12 +1406,8 @@ def sparse_unary_method(self): def _create_arithmetic_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op_name) def sparse_arithmetic_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to dispatch to us. - return NotImplemented - - other = lib.item_from_zerodim(other) if isinstance(other, SparseArray): return _sparse_array_op(self, other, op, op_name) @@ -1463,12 +1455,9 @@ def _create_comparison_method(cls, op): if op_name in {"and_", "or_"}: op_name = op_name[:-1] + @unpack_zerodim_and_defer(op_name) def cmp_method(self, other): - if isinstance(other, (ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. 
- return NotImplemented - if not is_scalar(other) and not isinstance(other, type(self)): # convert list-like to ndarray other = np.asarray(other) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 21e07b5101a64..816beb758dd33 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -45,6 +45,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import to_offset @@ -82,10 +83,8 @@ def _td_array_cmp(cls, op): opname = "__{name}__".format(name=op.__name__) nat_result = opname == "__ne__" + @unpack_zerodim_and_defer(opname) def wrapper(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented if _is_convertible_to_td(other) or other is NaT: try: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0fe0fe5a426ae..6f806c5bab6e4 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -21,7 +21,7 @@ is_scalar, is_timedelta64_dtype, ) -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCTimedeltaIndex +from pandas.core.dtypes.generic import ABCTimedeltaIndex from pandas.core import ops import pandas.core.common as com @@ -29,6 +29,7 @@ import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.io.formats.printing import pprint_thing @@ -731,9 +732,8 @@ def __getitem__(self, key): # fall back to Int64Index return super().__getitem__(key) + @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented if is_integer(other) and other != 0: if len(self) == 0 or self.start % other == 0 and self.step % other == 0: @@ -769,10 +769,9 @@ def _make_evaluate_binop(op, step=False): if False, use the existing step """ + @unpack_zerodim_and_defer(op.__name__) def _evaluate_numeric_binop(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - elif isinstance(other, ABCTimedeltaIndex): + if isinstance(other, ABCTimedeltaIndex): # Defer to TimedeltaIndex implementation return NotImplemented elif isinstance(other, (timedelta, np.timedelta64)): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 398fa9b0c1fc0..f7a1258894b89 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -29,6 +29,7 @@ logical_op, ) from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas.core.ops.dispatch import should_series_dispatch from pandas.core.ops.docstrings import ( @@ -489,9 +490,8 @@ def _arith_method_SERIES(cls, op, special): op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) + @unpack_zerodim_and_defer(op_name) def wrapper(left, right): - if isinstance(right, ABCDataFrame): - return NotImplemented left, right = _align_method_SERIES(left, right) res_name = get_op_result_name(left, right) @@ -512,14 +512,11 @@ def _comp_method_SERIES(cls, op, special): """ op_name = 
_get_op_name(op, special) + @unpack_zerodim_and_defer(op_name) def wrapper(self, other): res_name = get_op_result_name(self, other) - if isinstance(other, ABCDataFrame): # pragma: no cover - # Defer to DataFrame implementation; fail early - return NotImplemented - if isinstance(other, ABCSeries) and not self._indexed_same(other): raise ValueError("Can only compare identically-labeled Series objects") @@ -541,14 +538,11 @@ def _bool_method_SERIES(cls, op, special): """ op_name = _get_op_name(op, special) + @unpack_zerodim_and_defer(op_name) def wrapper(self, other): self, other = _align_method_SERIES(self, other, align_asobject=True) res_name = get_op_result_name(self, other) - if isinstance(other, ABCDataFrame): - # Defer to DataFrame implementation; fail early - return NotImplemented - lvalues = extract_array(self, extract_numpy=True) rvalues = extract_array(other, extract_numpy=True) diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py new file mode 100644 index 0000000000000..f4b16cf4a0cf2 --- /dev/null +++ b/pandas/core/ops/common.py @@ -0,0 +1,66 @@ +""" +Boilerplate functions used in defining binary operations. +""" +from functools import wraps + +from pandas._libs.lib import item_from_zerodim + +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries + + +def unpack_zerodim_and_defer(name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Parameters + ---------- + name : str + + Returns + ------- + decorator + """ + + def wrapper(method): + return _unpack_zerodim_and_defer(method, name) + + return wrapper + + +def _unpack_zerodim_and_defer(method, name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Ensure method returns NotImplemented when operating against "senior" + classes. Ensure zero-dimensional ndarrays are always unpacked. + + Parameters + ---------- + method : binary method + name : str + + Returns + ------- + method + """ + + is_cmp = name.strip("__") in {"eq", "ne", "lt", "le", "gt", "ge"} + + @wraps(method) + def new_method(self, other): + + if is_cmp and isinstance(self, ABCIndexClass) and isinstance(other, ABCSeries): + # For comparison ops, Index does *not* defer to Series + pass + else: + for cls in [ABCDataFrame, ABCSeries, ABCIndexClass]: + if isinstance(self, cls): + break + if isinstance(other, cls): + return NotImplemented + + other = item_from_zerodim(other) + + return method(self, other) + + return new_method diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 4d3d6e2df35db..1ba0930c06334 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1029,6 +1029,7 @@ def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array): [ "unsupported operand type", "cannot (add|subtract)", + "cannot use operands with types", "ufunc '?(add|subtract)'? 
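What the decorator buys, demonstrated on a toy class (this assumes the patch is applied so that ``pandas.core.ops.common`` exists): zero-dimensional ndarrays are unboxed before the method body runs, and senior pandas containers get ``NotImplemented`` back so reflected dispatch can take over::

    import numpy as np
    import pandas as pd
    from pandas.core.ops.common import unpack_zerodim_and_defer

    class Toy:
        def __init__(self, value):
            self.value = value

        @unpack_zerodim_and_defer("__add__")
        def __add__(self, other):
            # a 0-d ndarray has already been unboxed to a scalar here
            return Toy(self.value + other)

    assert (Toy(1) + np.array(2)).value == 3
    assert Toy(1).__add__(pd.Series([1])) is NotImplemented

Centralizing this check in one decorator is what lets the per-class boilerplate be deleted wholesale across the array classes above.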
cannot use operands with types", ] ) From 2bf5ce76ded6e05aba88f27adc6459d32c5f6144 Mon Sep 17 00:00:00 2001 From: Aivengoe Date: Thu, 14 Nov 2019 21:06:08 +0300 Subject: [PATCH 056/185] Replace _has_complex_internals #29227 (#29237) --- pandas/core/apply.py | 4 ++-- pandas/core/indexes/base.py | 5 ----- pandas/core/indexes/multi.py | 5 ----- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6302499b6d153..94b7c59b93563 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -13,7 +13,7 @@ is_list_like, is_sequence, ) -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries if TYPE_CHECKING: from pandas import DataFrame, Series, Index @@ -281,7 +281,7 @@ def apply_standard(self): and not self.dtypes.apply(is_extension_array_dtype).any() # Disallow complex_internals since libreduction shortcut # cannot handle MultiIndex - and not self.agg_axis._has_complex_internals + and not isinstance(self.agg_axis, ABCMultiIndex) ): values = self.values diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 20ca176f4a456..5ac361a83abd0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4086,11 +4086,6 @@ def _assert_can_do_op(self, value): msg = "'value' must be a scalar, passed: {0}" raise TypeError(msg.format(type(value).__name__)) - @property - def _has_complex_internals(self): - # to disable groupby tricks in MultiIndex - return False - def _is_memory_usage_qualified(self): """ Return a boolean if we need a qualified .info display. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a83fd6bf59f05..e768a5b6dd23c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1396,11 +1396,6 @@ def values(self): self._tuples = lib.fast_zip(values) return self._tuples - @property - def _has_complex_internals(self): - # to disable groupby tricks - return True - @cache_readonly def is_monotonic_increasing(self): """ From 051175401f16dd9812a6a93cf8e145ecd755856f Mon Sep 17 00:00:00 2001 From: Lucas Scarlato Astur Date: Thu, 14 Nov 2019 15:07:09 -0300 Subject: [PATCH 057/185] Update nullable integer docs with None instead of np.nan (#29619) Update nullable integer docs with None instead of np.nan --- doc/source/user_guide/integer_na.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index db6869a3804bd..f1f3d79eed61e 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -30,7 +30,7 @@ you must explicitly pass the dtype into :meth:`array` or :class:`Series`: .. 
ipython:: python - arr = pd.array([1, 2, np.nan], dtype=pd.Int64Dtype()) + arr = pd.array([1, 2, None], dtype=pd.Int64Dtype()) arr Or the string alias ``"Int64"`` (note the capital ``"I"``, to differentiate from From 193a46e3a2075db1c595cd39d83f5bd4845ab040 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Thu, 14 Nov 2019 12:07:33 -0700 Subject: [PATCH 058/185] STYLE: Specify target-version with black (#29607) --- ci/code_checks.sh | 2 +- pandas/core/frame.py | 12 ++++++------ pandas/core/generic.py | 8 ++++---- pandas/core/groupby/groupby.py | 4 ++-- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimes.py | 8 ++++---- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 4 ++-- pandas/core/indexes/period.py | 2 +- pandas/core/internals/blocks.py | 10 +++++----- pandas/core/internals/managers.py | 2 +- pandas/core/missing.py | 4 ++-- pandas/core/resample.py | 6 +++--- pandas/core/reshape/pivot.py | 2 +- pandas/core/series.py | 8 ++++---- pandas/core/strings.py | 4 ++-- pandas/core/window/rolling.py | 4 ++-- pandas/io/common.py | 2 +- pandas/io/excel/_base.py | 14 +++++++------- pandas/io/excel/_xlsxwriter.py | 4 ++-- pandas/io/formats/format.py | 4 ++-- pandas/io/gbq.py | 2 +- pandas/io/parquet.py | 12 ++++++------ pandas/io/parsers.py | 2 +- pandas/plotting/_core.py | 20 ++++++++++---------- pandas/plotting/_matplotlib/boxplot.py | 12 ++++++------ pandas/plotting/_matplotlib/core.py | 14 +++++++------- pandas/plotting/_matplotlib/hist.py | 16 ++++++++-------- pandas/plotting/_matplotlib/misc.py | 6 +++--- pandas/plotting/_matplotlib/tools.py | 2 +- pandas/plotting/_misc.py | 12 ++++++------ pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/reshape/test_pivot.py | 10 +++++----- pandas/tests/test_multilevel.py | 8 ++++---- pandas/tests/test_nanops.py | 24 ++++++++++++------------ pandas/tests/window/test_moments.py | 2 +- pandas/util/testing.py | 8 ++++---- pyproject.toml | 20 ++++++++++++++++++++ 38 files changed, 150 insertions(+), 130 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ab7bd7895a596..d5566c522ac64 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -56,7 +56,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then black --version MSG='Checking black formatting' ; echo $MSG - black . --check --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)' + black . --check RET=$(($RET + $?)) ; echo $MSG "DONE" # `setup.cfg` contains the list of error codes that are being ignored in flake8 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ce74081fb655b..ca421e9695888 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2082,7 +2082,7 @@ def to_stata( data_label=data_label, write_index=write_index, variable_labels=variable_labels, - **kwargs + **kwargs, ) writer.write_file() @@ -2106,7 +2106,7 @@ def to_parquet( compression="snappy", index=None, partition_cols=None, - **kwargs + **kwargs, ): """ Write a DataFrame to the binary parquet format. 
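# Illustrative aside, not part of the committed diff: ``DataFrame.to_parquet``
# simply forwards any extra keyword arguments to the parquet engine, so the
# trailing comma added after ``**kwargs`` above is purely stylistic. A minimal
# usage sketch, assuming pyarrow is installed and ``example.parquet`` is a
# writable path:
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_parquet("example.parquet", compression="snappy")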
@@ -2186,7 +2186,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, - **kwargs + **kwargs, ) @Substitution( @@ -4110,7 +4110,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs + **kwargs, ): return super().fillna( value=value, @@ -4119,7 +4119,7 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs + **kwargs, ) @Appender(_shared_docs["replace"] % _shared_doc_kwargs) @@ -6566,7 +6566,7 @@ def _gotitem( see_also=_agg_summary_and_see_also_doc, examples=_agg_examples_doc, versionadded="\n.. versionadded:: 0.20.0\n", - **_shared_doc_kwargs + **_shared_doc_kwargs, ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 47a0582edbea4..4e024bba3c999 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2055,7 +2055,7 @@ def __getstate__(self): _typ=self._typ, _metadata=self._metadata, attrs=self.attrs, - **meta + **meta, ) def __setstate__(self, state): @@ -7050,7 +7050,7 @@ def interpolate( limit_direction="forward", limit_area=None, downcast=None, - **kwargs + **kwargs, ): """ Interpolate values according to different methods. @@ -7124,7 +7124,7 @@ def interpolate( limit_area=limit_area, inplace=inplace, downcast=downcast, - **kwargs + **kwargs, ) if inplace: @@ -11572,7 +11572,7 @@ def stat_func( level=None, numeric_only=None, min_count=0, - **kwargs + **kwargs, ): if name == "sum": nv.validate_sum(tuple(), kwargs) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 204346bb7b741..280f1e88b0ea8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1214,7 +1214,7 @@ def median(self, **kwargs): return self._cython_agg_general( "median", alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), - **kwargs + **kwargs, ) @Substitution(name="groupby") @@ -2181,7 +2181,7 @@ def _get_cythonized_result( result_is_index: bool = False, pre_processing=None, post_processing=None, - **kwargs + **kwargs, ): """ Get result for Cythonized functions. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5ac361a83abd0..5b57d3f096b0c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -265,7 +265,7 @@ def __new__( name=None, fastpath=None, tupleize_cols=True, - **kwargs + **kwargs, ) -> "Index": from .range import RangeIndex diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e19ebf17a1b98..aee9be20a1593 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1428,7 +1428,7 @@ def date_range( normalize=False, name=None, closed=None, - **kwargs + **kwargs, ): """ Return a fixed frequency DatetimeIndex. 
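# Illustrative aside, not part of the committed diff: the ``date_range``
# signature above only gains a trailing comma; behaviour is unchanged. For
# reference, a minimal call:
import pandas as pd

idx = pd.date_range(start="2000-01-01", periods=3, freq="D", name="dates")
# DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'],
#               dtype='datetime64[ns]', name='dates', freq='D')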
@@ -1578,7 +1578,7 @@ def date_range( tz=tz, normalize=normalize, closed=closed, - **kwargs + **kwargs, ) return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) @@ -1594,7 +1594,7 @@ def bdate_range( weekmask=None, holidays=None, closed=None, - **kwargs + **kwargs, ): """ Return a fixed frequency DatetimeIndex, with business day as the default @@ -1687,7 +1687,7 @@ def bdate_range( normalize=normalize, name=name, closed=closed, - **kwargs + **kwargs, ) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 1c4addfb44839..4a75ab58b7a65 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -935,7 +935,7 @@ def get_loc( None is specified as these are not yet implemented. """ ) - } + }, ) ) @Appender(_index_shared_docs["get_indexer"]) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e768a5b6dd23c..d8f9db06c5e8c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -956,7 +956,7 @@ def copy( codes=None, deep=False, _set_identity=False, - **kwargs + **kwargs, ): """ Make a copy of this object. Names, dtype, levels and codes can be @@ -1020,7 +1020,7 @@ def _shallow_copy_with_infer(self, values, **kwargs): return MultiIndex( levels=[[] for _ in range(self.nlevels)], codes=[[] for _ in range(self.nlevels)], - **kwargs + **kwargs, ) return self._shallow_copy(values, **kwargs) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 2b70049fd14af..2df58b0bbc105 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -194,7 +194,7 @@ def __new__( dtype=None, copy=False, name=None, - **fields + **fields, ): valid_field_set = { diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5508cf3ca522e..38e1f241c1d77 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1088,7 +1088,7 @@ def interpolate( fill_value=None, coerce=False, downcast=None, - **kwargs + **kwargs, ): inplace = validate_bool_kwarg(inplace, "inplace") @@ -1138,7 +1138,7 @@ def check_int_bool(self, inplace): fill_value=fill_value, inplace=inplace, downcast=downcast, - **kwargs + **kwargs, ) def _interpolate_with_fill( @@ -1193,7 +1193,7 @@ def _interpolate( limit_area=None, inplace=False, downcast=None, - **kwargs + **kwargs, ): """ interpolate using scipy wrappers """ @@ -1231,7 +1231,7 @@ def func(x): limit_area=limit_area, fill_value=fill_value, bounds_error=False, - **kwargs + **kwargs, ) # interp each column independently @@ -2016,7 +2016,7 @@ def to_native_types( float_format=None, decimal=".", quoting=None, - **kwargs + **kwargs, ): """ convert to our native types format, slicing if desired """ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6408da37d4343..96fd4c6bdc6e5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -354,7 +354,7 @@ def apply( filter=None, do_integrity_check=False, consolidate=True, - **kwargs + **kwargs, ): """ iterate over the blocks, collect and create a new block manager diff --git a/pandas/core/missing.py b/pandas/core/missing.py index fb148b39c8a86..fc54c03c042b7 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -164,7 +164,7 @@ def interpolate_1d( fill_value=None, bounds_error=False, order=None, - **kwargs + **kwargs, ): """ Logic for the 1-d interpolation. 
The result should be 1-d, inputs @@ -300,7 +300,7 @@ def interpolate_1d( fill_value=fill_value, bounds_error=bounds_error, order=order, - **kwargs + **kwargs, ) result[preserve_nans] = np.nan return result diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 79c808cb3951c..5bb0716728778 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -794,7 +794,7 @@ def interpolate( limit_direction="forward", limit_area=None, downcast=None, - **kwargs + **kwargs, ): """ Interpolate values according to different methods. @@ -808,7 +808,7 @@ def interpolate( limit_direction=limit_direction, limit_area=limit_area, downcast=downcast, - **kwargs + **kwargs, ) def asfreq(self, fill_value=None): @@ -1370,7 +1370,7 @@ def __init__( kind=None, convention=None, base=0, - **kwargs + **kwargs, ): # Check for correctness of the keyword arguments which would # otherwise silently use the default if misspelled diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 404292fe4d539..ede33c5bd0258 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -570,7 +570,7 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, - **kwargs + **kwargs, ) # Post-process diff --git a/pandas/core/series.py b/pandas/core/series.py index c5e639fef8c5b..d771aefb55844 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3788,7 +3788,7 @@ def _gotitem(self, key, ndim, subset=None): see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="\n.. versionadded:: 0.20.0\n", - **_shared_doc_kwargs + **_shared_doc_kwargs, ) @Appender(generic._shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): @@ -4012,7 +4012,7 @@ def _reduce( skipna=skipna, numeric_only=numeric_only, filter_type=filter_type, - **kwds + **kwds, ) def _reindex_indexer(self, new_index, indexer, copy): @@ -4249,7 +4249,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs + **kwargs, ): return super().fillna( value=value, @@ -4258,7 +4258,7 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs + **kwargs, ) @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7194d1cf08e4a..a6e0c12526d8a 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1875,7 +1875,7 @@ def _noarg_wrapper( docstring=None, forbidden_types=["bytes"], returns_string=True, - **kargs + **kargs, ): @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): @@ -1898,7 +1898,7 @@ def _pat_wrapper( name=None, forbidden_types=["bytes"], returns_string=True, - **kwargs + **kwargs, ): @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index caf2f9e1c9dd3..fd221c53e244c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -72,7 +72,7 @@ def __init__( axis: Axis = 0, on: Optional[Union[str, Index]] = None, closed: Optional[str] = None, - **kwargs + **kwargs, ): self.__dict__.update(kwargs) @@ -399,7 +399,7 @@ def _apply( window: Optional[Union[int, str]] = None, center: Optional[bool] = None, check_minp: Optional[Callable] = None, - **kwargs + **kwargs, ): """ Rolling statistical measure using supplied function. 
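# Why this whole patch targets Python 3.6+: a trailing comma after ``*args``
# or ``**kwargs`` in function signatures and calls only became legal syntax
# in Python 3.6 (bpo-9232), which is what the new black ``target-version``
# in pyproject.toml (further below) opts into. A minimal demonstration:
def f(*args, **kwargs,):  # SyntaxError before Python 3.6
    return args, kwargs

f(1, 2, **{"key": "value"},)  # trailing comma after ``**`` in a call: also 3.6+ only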
diff --git a/pandas/io/common.py b/pandas/io/common.py index 0bef14e4999c7..9b9fe21b56989 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -528,7 +528,7 @@ def __init__( file: FilePathOrBuffer, mode: str, archive_name: Optional[str] = None, - **kwargs + **kwargs, ): if mode in ["wb", "rb"]: mode = mode.replace("b", "") diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d0ab6dd37596c..7ad7c40917b9c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -304,7 +304,7 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): for arg in ("sheet", "sheetname", "parse_cols"): @@ -344,7 +344,7 @@ def read_excel( skipfooter=skipfooter, convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) @@ -417,7 +417,7 @@ def parse( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): _validate_header_arg(header) @@ -517,7 +517,7 @@ def parse( skipfooter=skipfooter, usecols=usecols, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) output[asheetname] = parser.read(nrows=nrows) @@ -694,7 +694,7 @@ def __init__( date_format=None, datetime_format=None, mode="w", - **engine_kwargs + **engine_kwargs, ): # validate that this engine can handle the extension if isinstance(path, str): @@ -848,7 +848,7 @@ def parse( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): """ Parse specified sheet(s) into a DataFrame. @@ -886,7 +886,7 @@ def parse( skipfooter=skipfooter, convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) @property diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 07bf265da4863..6d9ff9be5249a 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -168,7 +168,7 @@ def __init__( date_format=None, datetime_format=None, mode="w", - **engine_kwargs + **engine_kwargs, ): # Use the xlsxwriter module as the Excel writer. 
import xlsxwriter @@ -182,7 +182,7 @@ def __init__( date_format=date_format, datetime_format=datetime_format, mode=mode, - **engine_kwargs + **engine_kwargs, ) self.book = xlsxwriter.Workbook(path, **engine_kwargs) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 17603809c2ea6..41bddc7683764 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1437,7 +1437,7 @@ def __init__( values: Union[np.ndarray, "Series", DatetimeIndex, DatetimeArray], nat_rep: str = "NaT", date_format: None = None, - **kwargs + **kwargs, ): super().__init__(values, **kwargs) self.nat_rep = nat_rep @@ -1658,7 +1658,7 @@ def __init__( values: Union[np.ndarray, TimedeltaIndex], nat_rep: str = "NaT", box: bool = False, - **kwargs + **kwargs, ): super().__init__(values, **kwargs) self.nat_rep = nat_rep diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index d29078cad9318..b120de1b3011a 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -173,7 +173,7 @@ def read_gbq( location=location, configuration=configuration, credentials=credentials, - **kwargs + **kwargs, ) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 69ee6583d12c8..edbf60cc91d0b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -89,7 +89,7 @@ def write( coerce_timestamps="ms", index=None, partition_cols=None, - **kwargs + **kwargs, ): self.validate_dataframe(df) path, _, _, _ = get_filepath_or_buffer(path, mode="wb") @@ -106,7 +106,7 @@ def write( compression=compression, coerce_timestamps=coerce_timestamps, partition_cols=partition_cols, - **kwargs + **kwargs, ) else: self.api.parquet.write_table( @@ -114,7 +114,7 @@ def write( path, compression=compression, coerce_timestamps=coerce_timestamps, - **kwargs + **kwargs, ) def read(self, path, columns=None, **kwargs): @@ -176,7 +176,7 @@ def write( compression=compression, write_index=index, partition_on=partition_cols, - **kwargs + **kwargs, ) def read(self, path, columns=None, **kwargs): @@ -205,7 +205,7 @@ def to_parquet( compression="snappy", index=None, partition_cols=None, - **kwargs + **kwargs, ): """ Write a DataFrame to the parquet format. @@ -252,7 +252,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, - **kwargs + **kwargs, ) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 058d65b9464ae..dba96358227c3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -707,7 +707,7 @@ def read_fwf( colspecs="infer", widths=None, infer_nrows=100, - **kwds + **kwds, ): r""" diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 6fc5b03920cba..6c3d70271fc12 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -23,7 +23,7 @@ def hist_series( figsize=None, bins=10, backend=None, - **kwargs + **kwargs, ): """ Draw histogram of the input series using matplotlib. @@ -83,7 +83,7 @@ def hist_series( yrot=yrot, figsize=figsize, bins=bins, - **kwargs + **kwargs, ) @@ -103,7 +103,7 @@ def hist_frame( layout=None, bins=10, backend=None, - **kwargs + **kwargs, ): """ Make a histogram of the DataFrame's. 
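# Illustrative aside, not part of the committed diff: ``hist_frame`` backs
# ``DataFrame.hist``; the edits above are comma-only. A typical call, assuming
# matplotlib is installed:
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.random.randn(100), "y": np.random.randn(100)})
axes = df.hist(bins=10, figsize=(6, 3))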
@@ -206,7 +206,7 @@ def hist_frame( figsize=figsize, layout=layout, bins=bins, - **kwargs + **kwargs, ) @@ -400,7 +400,7 @@ def boxplot( figsize=None, layout=None, return_type=None, - **kwargs + **kwargs, ): plot_backend = _get_plot_backend("matplotlib") return plot_backend.boxplot( @@ -414,7 +414,7 @@ def boxplot( figsize=figsize, layout=layout, return_type=return_type, - **kwargs + **kwargs, ) @@ -432,7 +432,7 @@ def boxplot_frame( layout=None, return_type=None, backend=None, - **kwargs + **kwargs, ): plot_backend = _get_plot_backend(backend) return plot_backend.boxplot_frame( @@ -446,7 +446,7 @@ def boxplot_frame( figsize=figsize, layout=layout, return_type=return_type, - **kwargs + **kwargs, ) @@ -463,7 +463,7 @@ def boxplot_frame_groupby( sharex=False, sharey=True, backend=None, - **kwargs + **kwargs, ): """ Make box plots from DataFrameGroupBy data. @@ -536,7 +536,7 @@ def boxplot_frame_groupby( layout=layout, sharex=sharex, sharey=sharey, - **kwargs + **kwargs, ) diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index cfd6c3519d82c..274f06cd3ec1d 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -184,7 +184,7 @@ def _grouped_plot_by_column( ax=None, layout=None, return_type=None, - **kwargs + **kwargs, ): grouped = data.groupby(by) if columns is None: @@ -234,7 +234,7 @@ def boxplot( figsize=None, layout=None, return_type=None, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -359,7 +359,7 @@ def boxplot_frame( figsize=None, layout=None, return_type=None, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -374,7 +374,7 @@ def boxplot_frame( figsize=figsize, layout=layout, return_type=return_type, - **kwds + **kwds, ) plt.draw_if_interactive() return ax @@ -392,7 +392,7 @@ def boxplot_frame_groupby( layout=None, sharex=False, sharey=True, - **kwds + **kwds, ): if subplots is True: naxes = len(grouped) @@ -432,6 +432,6 @@ def boxplot_frame_groupby( ax=ax, figsize=figsize, layout=layout, - **kwds + **kwds, ) return ret diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 5853367f71d56..5341dc3a6338a 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -102,7 +102,7 @@ def __init__( table=False, layout=None, include_bool=False, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -985,7 +985,7 @@ def _make_plot(self): c=c_values, label=label, cmap=cmap, - **self.kwds + **self.kwds, ) if cb: cbar_label = c if c_is_column else "" @@ -1095,7 +1095,7 @@ def _make_plot(self): column_num=i, stacking_id=stacking_id, is_errorbar=is_errorbar, - **kwds + **kwds, ) self._add_legend_handle(newlines[0], label, index=i) @@ -1250,7 +1250,7 @@ def _plot( column_num=None, stacking_id=None, is_errorbar=False, - **kwds + **kwds, ): if column_num == 0: @@ -1386,7 +1386,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) ax.set_title(label) elif self.stacked: @@ -1401,7 +1401,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) pos_prior = pos_prior + np.where(mask, y, 0) neg_prior = neg_prior + np.where(mask, 0, y) @@ -1415,7 +1415,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) self._add_legend_handle(rect, label, index=i) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index c4ac9ead3f3d3..b60e8fa8a3f7c 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ 
b/pandas/plotting/_matplotlib/hist.py @@ -49,7 +49,7 @@ def _plot( bottom=0, column_num=0, stacking_id=None, - **kwds + **kwds, ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) @@ -145,7 +145,7 @@ def _plot( ind=None, column_num=None, stacking_id=None, - **kwds + **kwds, ): from scipy.stats import gaussian_kde @@ -177,7 +177,7 @@ def _grouped_plot( layout=None, rot=0, ax=None, - **kwargs + **kwargs, ): if figsize == "default": @@ -226,7 +226,7 @@ def _grouped_hist( xrot=None, ylabelsize=None, yrot=None, - **kwargs + **kwargs, ): """ Grouped histogram @@ -290,7 +290,7 @@ def hist_series( yrot=None, figsize=None, bins=10, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -335,7 +335,7 @@ def hist_series( xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds + **kwds, ) if hasattr(axes, "ndim"): @@ -359,7 +359,7 @@ def hist_frame( figsize=None, layout=None, bins=10, - **kwds + **kwds, ): if by is not None: axes = _grouped_hist( @@ -377,7 +377,7 @@ def hist_frame( xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds + **kwds, ) return axes diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 6d5a94c4d5ff8..6d2363668e650 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -22,7 +22,7 @@ def scatter_matrix( density_kwds=None, hist_kwds=None, range_padding=0.05, - **kwds + **kwds, ): df = frame._get_numeric_data() n = df.columns.size @@ -160,7 +160,7 @@ def normalize(series): to_plot[kls][1], color=colors[i], label=pprint_thing(kls), - **kwds + **kwds, ) ax.legend() @@ -315,7 +315,7 @@ def parallel_coordinates( axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwds + **kwds, ): import matplotlib.pyplot as plt diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index caa0167c06389..bcbe5eea8b5ab 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -101,7 +101,7 @@ def _subplots( ax=None, layout=None, layout_type="box", - **fig_kw + **fig_kw, ): """Create a figure with a set of subplots already made. diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 59cbdb0318e5a..6c8bcdada5957 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -82,7 +82,7 @@ def scatter_matrix( density_kwds=None, hist_kwds=None, range_padding=0.05, - **kwargs + **kwargs, ): """ Draw a matrix of scatter plots. @@ -134,7 +134,7 @@ def scatter_matrix( density_kwds=density_kwds, hist_kwds=hist_kwds, range_padding=range_padding, - **kwargs + **kwargs, ) @@ -207,7 +207,7 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): ax=ax, color=color, colormap=colormap, - **kwds + **kwds, ) @@ -255,7 +255,7 @@ def andrews_curves( samples=samples, color=color, colormap=colormap, - **kwargs + **kwargs, ) @@ -325,7 +325,7 @@ def parallel_coordinates( axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwargs + **kwargs, ): """ Parallel coordinates plotting. 
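# Illustrative aside, not part of the committed diff: a minimal use of the
# ``parallel_coordinates`` wrapper reformatted above, assuming matplotlib is
# installed:
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0], "label": ["x", "y"]})
ax = pd.plotting.parallel_coordinates(df, class_column="label")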
@@ -383,7 +383,7 @@ def parallel_coordinates( axvlines=axvlines, axvlines_kwds=axvlines_kwds, sort_labels=sort_labels, - **kwargs + **kwargs, ) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index a187781ea214c..663e03aa1bc81 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1127,7 +1127,7 @@ def test_seriesgroupby_observed_true(df_cat, operation, kwargs): index = MultiIndex.from_frame( DataFrame( {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]}, - **kwargs + **kwargs, ) ) expected = Series(data=[1, 3, 2, 4], index=index, name="C") diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index a8386d21ba27f..5b6dc70894857 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -581,23 +581,23 @@ def test_pivot_tz_in_values(self): df = pd.DataFrame( [ { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"), }, ] diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 3b194044131a8..cd6acafc394c5 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -363,19 +363,19 @@ def test_unstack(self): [ ( [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]], - [u"ix1", u"ix2", u"col1", u"col2", u"col3", u"col4"], + ["ix1", "ix2", "col1", "col2", "col3", "col4"], 2, [None, None, 30.0, None], ), ( [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]], - [u"ix1", u"ix2", u"col1", u"col2", u"col3"], + ["ix1", "ix2", "col1", "col2", "col3"], 2, [None, None, 30.0], ), ( [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]], - [u"ix1", u"ix2", u"col1", u"col2", u"col3"], + ["ix1", "ix2", "col1", "col2", "col3"], None, [None, None, 30.0], ), @@ -389,7 +389,7 @@ def test_unstack_partial( # make sure DataFrame.unstack() works when its run on a subset of the DataFrame # and the Index levels contain values that are not present in the subset result = pd.DataFrame(result_rows, columns=result_columns).set_index( - [u"ix1", "ix2"] + ["ix1", "ix2"] ) result = result.iloc[1:2].unstack("ix2") expected = pd.DataFrame( diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 52ad56967220f..7b76a1c0a6c86 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -151,7 +151,7 @@ def check_fun_data( targarval, check_dtype=True, empty_targfunc=None, - **kwargs + **kwargs, ): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: @@ -186,7 +186,7 @@ def check_fun_data( targarval2, check_dtype=check_dtype, empty_targfunc=empty_targfunc, - **kwargs + **kwargs, ) def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): @@ -203,7 +203,7 @@ def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): testarval, targarval, empty_targfunc=empty_targfunc, - **kwargs + **kwargs, ) def check_funs( @@ -215,7 +215,7 @@ def check_funs( allow_date=True, allow_tdelta=True, allow_obj=True, - **kwargs + **kwargs, ): self.check_fun(testfunc, 
targfunc, "arr_float", **kwargs) self.check_fun(testfunc, targfunc, "arr_float_nan", **kwargs) @@ -476,7 +476,7 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_2d, self.arr_float1_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) @@ -486,7 +486,7 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_2d, self.arr_float1_nan_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -500,13 +500,13 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_2d, self.arr_nan_float1_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) res25 = checkfun( self.arr_float_2d, self.arr_float1_2d, min_periods=len(self.arr_float_2d) + 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) @@ -521,7 +521,7 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_1d, self.arr_float1_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) @@ -531,7 +531,7 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_1d, self.arr_float1_nan_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -545,13 +545,13 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_1d, self.arr_nan_float1_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) res25 = checkfun( self.arr_float_1d, self.arr_float1_1d, min_periods=len(self.arr_float_1d) + 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/test_moments.py index 36a0ddb3e02d7..6e4bc621d7f49 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/test_moments.py @@ -800,7 +800,7 @@ def _check_moment_func( has_time_rule=True, fill_value=None, zero_min_periods_equal=True, - **kwargs + **kwargs, ): # inject raw diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4ba32c377a345..2b4cb322fc966 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -316,7 +316,7 @@ def assert_almost_equal( check_exact=False, exact=check_dtype, check_less_precise=check_less_precise, - **kwargs + **kwargs, ) elif isinstance(left, pd.Series): @@ -326,7 +326,7 @@ def assert_almost_equal( check_exact=False, check_dtype=check_dtype, check_less_precise=check_less_precise, - **kwargs + **kwargs, ) elif isinstance(left, pd.DataFrame): @@ -336,7 +336,7 @@ def assert_almost_equal( check_exact=False, check_dtype=check_dtype, check_less_precise=check_less_precise, - **kwargs + **kwargs, ) else: @@ -359,7 +359,7 @@ def assert_almost_equal( right, check_dtype=check_dtype, check_less_precise=check_less_precise, - **kwargs + **kwargs, ) diff --git a/pyproject.toml b/pyproject.toml index b105f8aeb3291..28d7c3d55c919 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,3 +10,23 @@ requires = [ "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'", "numpy==1.16.0; python_version>='3.7' and platform_system=='AIX'", ] + +[tool.black] +target-version = ['py36', 'py37', 'py38'] +exclude = ''' +( + 
asv_bench/env + | \.egg + | \.git + | \.hg + | \.mypy_cache + | \.nox + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + | setup.py +) +''' From 71b78685aaf9c6fbe1786774e98cb5df54e87cd5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Nov 2019 11:53:20 -0800 Subject: [PATCH 059/185] REF: eliminate eval_kwargs (#29611) --- pandas/core/computation/expressions.py | 13 ++++---- pandas/core/ops/__init__.py | 41 ++------------------------ pandas/core/ops/array_ops.py | 19 +++++------- 3 files changed, 15 insertions(+), 58 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 46bc762e1a0b3..77999d2c166fd 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -62,9 +62,8 @@ def set_numexpr_threads(n=None): ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b, reversed=False): +def _evaluate_standard(op, op_str, a, b): """ standard evaluation """ - # `reversed` kwarg is included for compatibility with _evaluate_numexpr if _TEST_MODE: _store_test_result(False) with np.errstate(all="ignore"): @@ -97,11 +96,12 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, reversed=False): +def _evaluate_numexpr(op, op_str, a, b): result = None if _can_use_numexpr(op, op_str, a, b, "evaluate"): - if reversed: + is_reversed = op.__name__.strip("_").startswith("r") + if is_reversed: # we were originally called by a reversed op method a, b = b, a @@ -190,7 +190,7 @@ def _bool_arith_check( return True -def evaluate(op, op_str, a, b, use_numexpr=True, reversed=False): +def evaluate(op, op_str, a, b, use_numexpr=True): """ Evaluate and return the expression of the op on a and b. @@ -203,12 +203,11 @@ def evaluate(op, op_str, a, b, use_numexpr=True, reversed=False): b : right operand use_numexpr : bool, default True Whether to try to use numexpr. - reversed : bool, default False """ use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - return _evaluate(op, op_str, a, b, reversed=reversed) + return _evaluate(op, op_str, a, b) return _evaluate_standard(op, op_str, a, b) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f7a1258894b89..d14fb040c4e30 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -182,41 +182,6 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # ----------------------------------------------------------------------------- -def _gen_eval_kwargs(name): - """ - Find the keyword arguments to pass to numexpr for the given operation. - - Parameters - ---------- - name : str - - Returns - ------- - eval_kwargs : dict - - Examples - -------- - >>> _gen_eval_kwargs("__add__") - {} - - >>> _gen_eval_kwargs("rtruediv") - {'reversed': True, 'truediv': True} - """ - kwargs = {} - - # Series appear to only pass __add__, __radd__, ... - # but DataFrame gets both these dunder names _and_ non-dunder names - # add, radd, ... 
- name = name.replace("__", "") - - if name.startswith("r"): - if name not in ["radd", "rand", "ror", "rxor"]: - # Exclude commutative operations - kwargs["reversed"] = True - - return kwargs - - def _get_frame_op_default_axis(name): """ Only DataFrame cares about default_axis, specifically: @@ -488,7 +453,6 @@ def _arith_method_SERIES(cls, op, special): """ str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(op_name) @unpack_zerodim_and_defer(op_name) def wrapper(left, right): @@ -497,7 +461,7 @@ def wrapper(left, right): res_name = get_op_result_name(left, right) lvalues = extract_array(left, extract_numpy=True) - result = arithmetic_op(lvalues, right, op, str_rep, eval_kwargs) + result = arithmetic_op(lvalues, right, op, str_rep) return _construct_result(left, result, index=left.index, name=res_name) @@ -682,10 +646,9 @@ def to_series(right): def _arith_method_FRAME(cls, op, special): str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(op_name) default_axis = _get_frame_op_default_axis(op_name) - na_op = define_na_arithmetic_op(op, str_rep, eval_kwargs) + na_op = define_na_arithmetic_op(op, str_rep) is_logical = str_rep in ["&", "|", "^"] if op_name in _op_descriptions: diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 46c3b8b575af9..414e241af7bbd 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -3,7 +3,7 @@ ExtensionArrays. """ import operator -from typing import Any, Mapping, Union +from typing import Any, Union import numpy as np @@ -118,14 +118,14 @@ def masked_arith_op(x, y, op): return result -def define_na_arithmetic_op(op, str_rep: str, eval_kwargs): +def define_na_arithmetic_op(op, str_rep: str): def na_op(x, y): - return na_arithmetic_op(x, y, op, str_rep, eval_kwargs) + return na_arithmetic_op(x, y, op, str_rep) return na_op -def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): +def na_arithmetic_op(left, right, op, str_rep: str): """ Return the result of evaluating op on the passed in values. @@ -136,7 +136,6 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): left : np.ndarray right : np.ndarray or scalar str_rep : str or None - eval_kwargs : kwargs to pass to expressions Returns ------- @@ -149,7 +148,7 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): import pandas.core.computation.expressions as expressions try: - result = expressions.evaluate(op, str_rep, left, right, **eval_kwargs) + result = expressions.evaluate(op, str_rep, left, right) except TypeError: result = masked_arith_op(left, right, op) @@ -157,11 +156,7 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): def arithmetic_op( - left: Union[np.ndarray, ABCExtensionArray], - right: Any, - op, - str_rep: str, - eval_kwargs: Mapping[str, bool], + left: Union[np.ndarray, ABCExtensionArray], right: Any, op, str_rep: str ): """ Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... 
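# How the deleted ``eval_kwargs``/``reversed`` plumbing is replaced: the
# numexpr path in expressions.py (earlier in this patch) now infers
# reversedness from the op's ``__name__``. A sketch of that check; ``radd``
# here is hand-written the way pandas defines its reversed ops:
import operator

def radd(left, right):
    return right + left

for op in (operator.add, radd):
    print(op.__name__, op.__name__.strip("_").startswith("r"))
# add False
# radd True
# Note: commutative reversed ops (radd, ror_, ...) now also report True;
# swapping their operands is harmless, so the old special-casing was dropped.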
@@ -212,7 +207,7 @@ def arithmetic_op( else: with np.errstate(all="ignore"): - res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, eval_kwargs) + res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep) return res_values From f7d6b584f83babf3f5c6d1610f6150d809be460f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 15 Nov 2019 06:53:29 -0800 Subject: [PATCH 060/185] REF: eliminate statefulness in FrameApply (#29620) --- pandas/core/apply.py | 54 +++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 94b7c59b93563..071cd116ea982 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,6 +1,6 @@ import abc import inspect -from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Type, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, Tuple, Type, Union import numpy as np @@ -72,7 +72,9 @@ def series_generator(self) -> Iterator["Series"]: pass @abc.abstractmethod - def wrap_results_for_axis(self, results: ResType) -> Union["Series", "DataFrame"]: + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: pass # --------------------------------------------------------------- @@ -112,15 +114,6 @@ def f(x): self.f = f - # results - self.result = None - self._res_index: Optional["Index"] = None - - @property - def res_index(self) -> "Index": - assert self._res_index is not None - return self._res_index - @property def res_columns(self) -> "Index": return self.result_columns @@ -313,12 +306,12 @@ def apply_standard(self): return self.obj._constructor_sliced(result, index=labels) # compute the result using the series generator - results = self.apply_series_generator() + results, res_index = self.apply_series_generator() # wrap results - return self.wrap_results(results) + return self.wrap_results(results, res_index) - def apply_series_generator(self) -> ResType: + def apply_series_generator(self) -> Tuple[ResType, "Index"]: series_gen = self.series_generator res_index = self.result_index @@ -345,19 +338,20 @@ def apply_series_generator(self) -> ResType: results[i] = self.f(v) keys.append(v.name) - self._res_index = res_index - return results + return results, res_index - def wrap_results(self, results: ResType) -> Union["Series", "DataFrame"]: + def wrap_results( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: # see if we can infer the results if len(results) > 0 and 0 in results and is_sequence(results[0]): - return self.wrap_results_for_axis(results) + return self.wrap_results_for_axis(results, res_index) # dict of scalars result = self.obj._constructor_sliced(results) - result.index = self.res_index + result.index = res_index return result @@ -380,7 +374,9 @@ def result_index(self) -> "Index": def result_columns(self) -> "Index": return self.index - def wrap_results_for_axis(self, results: ResType) -> "DataFrame": + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> "DataFrame": """ return the results for the rows """ result = self.obj._constructor(data=results) @@ -389,8 +385,8 @@ def wrap_results_for_axis(self, results: ResType) -> "DataFrame": if len(result.index) == len(self.res_columns): result.index = self.res_columns - if len(result.columns) == len(self.res_index): - result.columns = self.res_index + if len(result.columns) == len(res_index): + result.columns = res_index return result @@ -418,35 +414,37 @@ def result_index(self) 
-> "Index": def result_columns(self) -> "Index": return self.columns - def wrap_results_for_axis(self, results: ResType) -> Union["Series", "DataFrame"]: + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: """ return the results for the columns """ result: Union["Series", "DataFrame"] # we have requested to expand if self.result_type == "expand": - result = self.infer_to_same_shape(results) + result = self.infer_to_same_shape(results, res_index) # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): from pandas import Series result = Series(results) - result.index = self.res_index + result.index = res_index # we may want to infer results else: - result = self.infer_to_same_shape(results) + result = self.infer_to_same_shape(results, res_index) return result - def infer_to_same_shape(self, results: ResType) -> "DataFrame": + def infer_to_same_shape(self, results: ResType, res_index: "Index") -> "DataFrame": """ infer the results to the same shape as the input object """ result = self.obj._constructor(data=results) result = result.T # set the index - result.index = self.res_index + result.index = res_index # infer dtypes result = result.infer_objects() From 710d82c0d393c9031e469ec0371660d8187b7dc3 Mon Sep 17 00:00:00 2001 From: Jung Dong Ho Date: Fri, 15 Nov 2019 23:55:05 +0900 Subject: [PATCH 061/185] BUG: make pct_change can handle the anchored freq #28664 (#28681) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/generic.py | 1 + pandas/tests/series/test_timeseries.py | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 90606fb61ada8..3b7756256dcab 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -440,6 +440,7 @@ Reshaping - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). 
- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) +- Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`) Sparse ^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4e024bba3c999..17784b623c414 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10443,6 +10443,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwar data = self.fillna(method=fill_method, limit=limit, axis=axis) rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 + rs = rs.loc[~rs.index.duplicated()] rs = rs.reindex_like(data) if freq is None: mask = isna(com.values_from_object(data)) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 7154975c6c73b..4ae00bca3e832 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -370,6 +370,16 @@ def test_pct_change(self, datetime_series): rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) ) + def test_pct_change_with_duplicate_axis(self): + # GH 28664 + common_idx = date_range("2019-11-14", periods=5, freq="D") + result = Series(range(5), common_idx).pct_change(freq="B") + + # the reason that the expected should be like this is documented at PR 28681 + expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) + + tm.assert_series_equal(result, expected) + def test_pct_change_shift_over_nas(self): s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) From 7ba0e0281da63b3496dc8b0382f9c5e03810fdcc Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Fri, 15 Nov 2019 16:56:01 +0200 Subject: [PATCH 062/185] CLN:Typing in pandas/core/dtypes/ (#29606) --- pandas/core/dtypes/base.py | 4 ++-- pandas/core/dtypes/common.py | 6 +++--- pandas/core/dtypes/concat.py | 2 +- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/dtypes/missing.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d9d3b0d45e218..c90f1cdeaabfd 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -86,7 +86,7 @@ def __from_arrow__( def __str__(self) -> str: return self.name - def __eq__(self, other): + def __eq__(self, other) -> bool: """ Check whether 'other' is equal to self. @@ -119,7 +119,7 @@ def __eq__(self, other): def __hash__(self) -> int: return hash(tuple(getattr(self, attr) for attr in self._metadata)) - def __ne__(self, other): + def __ne__(self, other) -> bool: return not self.__eq__(other) @property diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 635bc5ce03680..dcc8a274492ee 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -726,7 +726,7 @@ def is_string_dtype(arr_or_dtype) -> bool: """ # TODO: gh-15585: consider making the checks stricter. - def condition(dtype): + def condition(dtype) -> bool: return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype) return _is_dtype(arr_or_dtype, condition) @@ -1496,7 +1496,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: return issubclass(dtype.type, np.bool_) -def is_extension_type(arr): +def is_extension_type(arr) -> bool: """ Check whether an array-like is of a pandas extension class instance. 
@@ -1561,7 +1561,7 @@ def is_extension_type(arr): return False -def is_extension_array_dtype(arr_or_dtype): +def is_extension_array_dtype(arr_or_dtype) -> bool: """ Check if an object is a pandas extension array type. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 768272e173c82..7b3e7d4f42121 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -88,7 +88,7 @@ def concat_compat(to_concat, axis: int = 0): # filter empty arrays # 1-d dtypes always are included here - def is_nonempty(x): + def is_nonempty(x) -> bool: if x.ndim <= axis: return True return x.shape[axis] > 0 diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a0712a0df237b..3d1388db371ca 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -51,7 +51,7 @@ def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: class Registry: """ - Registry for dtype inference + Registry for dtype inference. The registry allows one to map a string repr of a extension dtype to an extension dtype. The string alias can be used in several diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index df89bd374f22e..aeba4eebc498e 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -375,7 +375,7 @@ def notna(obj): notnull = notna -def _isna_compat(arr, fill_value=np.nan): +def _isna_compat(arr, fill_value=np.nan) -> bool: """ Parameters ---------- @@ -392,7 +392,7 @@ def _isna_compat(arr, fill_value=np.nan): return True -def array_equivalent(left, right, strict_nan: bool = False): +def array_equivalent(left, right, strict_nan: bool = False) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. 
It is assumed that left and From 17460997f4c9837647425c533fb6602d988fba43 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 15 Nov 2019 06:59:59 -0800 Subject: [PATCH 063/185] REF: de-privatize indexes.api names (#29495) --- pandas/core/groupby/generic.py | 4 ++-- pandas/core/index.py | 6 +---- pandas/core/indexes/api.py | 33 +++++++++++++-------------- pandas/core/internals/construction.py | 14 ++++-------- pandas/core/reshape/concat.py | 14 ++++++------ pandas/core/reshape/pivot.py | 4 ++-- pandas/tests/indexes/test_base.py | 8 ++----- 7 files changed, 35 insertions(+), 48 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 002d8640f109d..9073a1e31dfb0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -63,7 +63,7 @@ _transform_template, get_groupby, ) -from pandas.core.index import Index, MultiIndex, _all_indexes_same +from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series @@ -1186,7 +1186,7 @@ def first_not_none(values): if isinstance(v, (np.ndarray, Index, Series)): if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = _all_indexes_same([x.index for x in values]) + all_indexed_same = all_indexes_same([x.index for x in values]) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 diff --git a/pandas/core/index.py b/pandas/core/index.py index d308ac1a9b1c7..84b37b8bd659d 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -13,13 +13,9 @@ RangeIndex, TimedeltaIndex, UInt64Index, - _all_indexes_same, - _get_combined_index, - _get_consensus_names, - _get_objs_combined_axis, _new_Index, - _union_indexes, ensure_index, ensure_index_from_sequences, + get_objs_combined_axis, ) from pandas.core.indexes.multi import _sparsify # noqa:F401 diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 86d55ce2e7cc3..a7cf2c20b0dec 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -6,23 +6,23 @@ import pandas.core.common as com from pandas.core.indexes.base import ( Index, + InvalidIndexError, _new_Index, ensure_index, ensure_index_from_sequences, ) -from pandas.core.indexes.base import InvalidIndexError # noqa:F401 -from pandas.core.indexes.category import CategoricalIndex # noqa:F401 +from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.interval import IntervalIndex # noqa:F401 -from pandas.core.indexes.multi import MultiIndex # noqa:F401 -from pandas.core.indexes.numeric import ( # noqa:F401 +from pandas.core.indexes.interval import IntervalIndex +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.numeric import ( Float64Index, Int64Index, NumericIndex, UInt64Index, ) from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.range import RangeIndex # noqa:F401 +from pandas.core.indexes.range import RangeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex _sort_msg = textwrap.dedent( @@ -57,15 +57,14 @@ "NaT", "ensure_index", "ensure_index_from_sequences", - "_get_combined_index", - "_get_objs_combined_axis", - "_union_indexes", - "_get_consensus_names", - "_all_indexes_same", + "get_objs_combined_axis", + "union_indexes", + "get_consensus_names", + "all_indexes_same", ] -def _get_objs_combined_axis(objs, 
intersect=False, axis=0, sort=True): +def get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): """ Extract combined index: return intersection or union (depending on the value of "intersect") of indexes on given axis, or None if all objects @@ -137,7 +136,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): for other in indexes[1:]: index = index.intersection(other) else: - index = _union_indexes(indexes, sort=sort) + index = union_indexes(indexes, sort=sort) index = ensure_index(index) if sort: @@ -148,7 +147,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): return index -def _union_indexes(indexes, sort=True): +def union_indexes(indexes, sort=True): """ Return the union of indexes. @@ -217,7 +216,7 @@ def conv(i): return _unique_indices(indexes) - name = _get_consensus_names(indexes)[0] + name = get_consensus_names(indexes)[0] if name != index.name: index = index._shallow_copy(name=name) return index @@ -264,7 +263,7 @@ def _sanitize_and_check(indexes): return indexes, "array" -def _get_consensus_names(indexes): +def get_consensus_names(indexes): """ Give a consensus 'names' to indexes. @@ -289,7 +288,7 @@ def _get_consensus_names(indexes): return [None] * indexes[0].nlevels -def _all_indexes_same(indexes): +def all_indexes_same(indexes): """ Determine if all indexes contain the same elements. diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c24fb0a7dc39a..2980deb9a052c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -37,13 +37,9 @@ from pandas.core import algorithms, common as com from pandas.core.arrays import Categorical from pandas.core.construction import sanitize_array -from pandas.core.index import ( - Index, - _get_objs_combined_axis, - _union_indexes, - ensure_index, -) +from pandas.core.index import Index, ensure_index, get_objs_combined_axis from pandas.core.indexes import base as ibase +from pandas.core.indexes.api import union_indexes from pandas.core.internals import ( create_block_manager_from_arrays, create_block_manager_from_blocks, @@ -345,9 +341,9 @@ def extract_index(data): raise ValueError("If using all scalar values, you must pass an index") if have_series: - index = _union_indexes(indexes) + index = union_indexes(indexes) elif have_dicts: - index = _union_indexes(indexes, sort=False) + index = union_indexes(indexes, sort=False) if have_raw_arrays: lengths = list(set(raw_lengths)) @@ -493,7 +489,7 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: - columns = _get_objs_combined_axis(data, sort=False) + columns = get_objs_combined_axis(data, sort=False) indexer_cache = {} diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 3c1b2b1eb11d2..3efe8072d3323 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -13,11 +13,11 @@ ) import pandas.core.common as com from pandas.core.generic import NDFrame -from pandas.core.index import ( - _all_indexes_same, - _get_consensus_names, - _get_objs_combined_axis, +from pandas.core.indexes.api import ( + all_indexes_same, ensure_index, + get_consensus_names, + get_objs_combined_axis, ) import pandas.core.indexes.base as ibase from pandas.core.internals import concatenate_block_managers @@ -523,7 +523,7 @@ def _get_new_axes(self): def _get_comb_axis(self, i): data_axis = self.objs[0]._get_block_manager_axis(i) try: - 
return _get_objs_combined_axis( + return get_objs_combined_axis( self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort ) except IndexError: @@ -617,7 +617,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde else: levels = [ensure_index(x) for x in levels] - if not _all_indexes_same(indexes): + if not all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes @@ -660,7 +660,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde ) # also copies - names = names + _get_consensus_names(indexes) + names = names + get_consensus_names(indexes) return MultiIndex( levels=levels, codes=codes_list, names=names, verify_integrity=False diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ede33c5bd0258..9ac27b0450bbe 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -9,7 +9,7 @@ import pandas.core.common as com from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper -from pandas.core.index import Index, MultiIndex, _get_objs_combined_axis +from pandas.core.index import Index, MultiIndex, get_objs_combined_axis from pandas.core.reshape.concat import concat from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series @@ -541,7 +541,7 @@ def crosstab( rownames = _get_names(index, rownames, prefix="row") colnames = _get_names(columns, colnames, prefix="col") - common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False) + common_idx = get_objs_combined_axis(index + columns, intersect=True, sort=False) data = {} data.update(zip(rownames, index)) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 90f8fbc5faef2..8ffceb491aa86 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -33,12 +33,8 @@ period_range, ) from pandas.core.algorithms import safe_sort -from pandas.core.index import ( - _get_combined_index, - ensure_index, - ensure_index_from_sequences, -) -from pandas.core.indexes.api import Index, MultiIndex +from pandas.core.index import ensure_index, ensure_index_from_sequences +from pandas.core.indexes.api import Index, MultiIndex, _get_combined_index from pandas.tests.indexes.common import Base from pandas.tests.indexes.conftest import indices_dict import pandas.util.testing as tm From 207ab743e4c665e81ada72065ec5b4cd9c95c7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Srinivas=20Reddy=20Thatiparthy=20=28=E0=B0=B6=E0=B1=8D?= =?UTF-8?q?=E0=B0=B0=E0=B1=80=E0=B0=A8=E0=B0=BF=E0=B0=B5=E0=B0=BE=E0=B0=B8?= =?UTF-8?q?=E0=B1=8D=20=20=E0=B0=B0=E0=B1=86=E0=B0=A1=E0=B1=8D=E0=B0=A1?= =?UTF-8?q?=E0=B0=BF=20=E0=B0=A4=E0=B0=BE=E0=B0=9F=E0=B0=BF=E0=B0=AA?= =?UTF-8?q?=E0=B0=B0=E0=B1=8D=E0=B0=A4=E0=B0=BF=29?= Date: Fri, 15 Nov 2019 20:43:01 +0530 Subject: [PATCH 064/185] CLN: Update .format(...) 
 strings to f-expressions (#29571) --- asv_bench/benchmarks/categoricals.py | 10 +++++----- asv_bench/benchmarks/gil.py | 6 ++---- asv_bench/benchmarks/index_object.py | 2 +- asv_bench/benchmarks/io/csv.py | 10 ++++------ asv_bench/benchmarks/io/excel.py | 2 +- asv_bench/benchmarks/io/hdf.py | 2 +- asv_bench/benchmarks/io/json.py | 4 ++-- asv_bench/benchmarks/io/msgpack.py | 2 +- asv_bench/benchmarks/io/pickle.py | 2 +- asv_bench/benchmarks/io/sql.py | 4 ++-- asv_bench/benchmarks/io/stata.py | 4 ++-- asv_bench/benchmarks/timedelta.py | 6 +++--- pandas/core/groupby/grouper.py | 6 +++++- pandas/io/common.py | 4 ++-- 14 files changed, 32 insertions(+), 32 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 4384ccb7fa8b3..e21d859d18c8c 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -84,7 +84,7 @@ class ValueCounts: def setup(self, dropna): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_value_counts(self, dropna): @@ -102,7 +102,7 @@ def time_rendering(self): class SetCategories: def setup(self): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_set_categories(self): @@ -112,7 +112,7 @@ def time_set_categories(self): class RemoveCategories: def setup(self): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_remove_categories(self): @@ -166,7 +166,7 @@ def setup(self, dtype): sample_size = 100 arr = [i for i in np.random.randint(0, n // 10, size=n)] if dtype == "object": - arr = ["s{:04d}".format(i) for i in arr] + arr = [f"s{i:04d}" for i in arr] self.sample = np.random.choice(arr, sample_size) self.series = pd.Series(arr).astype("category") @@ -225,7 +225,7 @@ def setup(self, index): elif index == "non_monotonic": self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) else: - raise ValueError("Invalid index param: {}".format(index)) + raise ValueError(f"Invalid index param: {index}") self.scalar = 10000 self.list = list(range(10000)) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 43c8594b8c8df..860c6cc6192bb 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -250,13 +250,11 @@ def setup(self, dtype): np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) ), "object": DataFrame( - "foo", - index=range(rows), - columns=["object%03d".format(i) for i in range(5)], + "foo", index=range(rows), columns=[f"object{i:03d}" for i in range(5)] ), } - self.fname = "__test_{}__.csv".format(dtype) + self.fname = f"__test_{dtype}__.csv" df = data[dtype] df.to_csv(self.fname) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index a94960d494707..f1d5209ac65ef 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -146,7 +146,7 @@ class Indexing: def setup(self, dtype): N = 10 ** 6 - self.idx = getattr(tm, "make{}Index".format(dtype))(N) + self.idx = getattr(tm, f"make{dtype}Index")(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask =
Series(self.array_mask) self.sorted = self.idx.sort_values() diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9b8599b0a1b64..adb3dd95e3574 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -202,7 +202,7 @@ def setup(self, sep, thousands): data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) df = DataFrame(data) if thousands is not None: - fmt = ":{}".format(thousands) + fmt = f":{thousands}" fmt = "{" + fmt + "}" df = df.applymap(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep) @@ -231,7 +231,7 @@ def setup(self, sep, decimal, float_precision): floats = [ "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15) ] - rows = sep.join(["0{}".format(decimal) + "{}"] * 3) + "\n" + rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n" data = rows * 5 data = data.format(*floats) * 200 # 1000 x 3 strings csv self.StringIO_input = StringIO(data) @@ -309,9 +309,7 @@ class ReadCSVCachedParseDates(StringIORewind): param_names = ["do_cache"] def setup(self, do_cache): - data = ( - "\n".join("10/{}".format(year) for year in range(2000, 2100)) + "\n" - ) * 10 + data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache): @@ -336,7 +334,7 @@ class ReadCSVMemoryGrowth(BaseIO): def setup(self): with open(self.fname, "w") as f: for i in range(self.num_rows): - f.write("{i}\n".format(i=i)) + f.write(f"{i}\n") def mem_parser_chunks(self): # see gh-24805. diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index c97cf768e27d9..75d87140488e3 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -14,7 +14,7 @@ def _generate_dataframe(): C = 5 df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index b78dc63d17130..88c1a3dc48ea4 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -115,7 +115,7 @@ def setup(self, format): C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 5c1d39776b91c..8f037e94e0095 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -20,7 +20,7 @@ def setup(self, orient, index): } df = DataFrame( np.random.randn(N, 5), - columns=["float_{}".format(i) for i in range(5)], + columns=[f"float_{i}" for i in range(5)], index=indexes[index], ) df.to_json(self.fname, orient=orient) @@ -43,7 +43,7 @@ def setup(self, index): } df = DataFrame( np.random.randn(N, 5), - columns=["float_{}".format(i) for i in range(5)], + columns=[f"float_{i}" for i in range(5)], index=indexes[index], ) df.to_json(self.fname, orient="records", lines=True) diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index f5038602539ab..a5b8b81bed85b 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -15,7 +15,7 @@ def setup(self): C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], 
+ columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 647e9d27dec9d..12620656dd2bf 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -13,7 +13,7 @@ def setup(self): C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index fe84c869717e3..6cc7f56ae3d65 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -19,7 +19,7 @@ def setup(self, connection): "sqlite": sqlite3.connect(":memory:"), } self.table_name = "test_type" - self.query_all = "SELECT * FROM {}".format(self.table_name) + self.query_all = f"SELECT * FROM {self.table_name}" self.con = con[connection] self.df = DataFrame( { @@ -58,7 +58,7 @@ def setup(self, connection, dtype): "sqlite": sqlite3.connect(":memory:"), } self.table_name = "test_type" - self.query_col = "SELECT {} FROM {}".format(dtype, self.table_name) + self.query_col = f"SELECT {dtype} FROM {self.table_name}" self.con = con[connection] self.df = DataFrame( { diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index 28829785d72e9..f3125f8598418 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -17,7 +17,7 @@ def setup(self, convert_dates): C = self.C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(self.N) @@ -47,7 +47,7 @@ def setup(self, convert_dates): for i in range(10): missing_data = np.random.randn(self.N) missing_data[missing_data < 0] = np.nan - self.df["missing_{0}".format(i)] = missing_data + self.df[f"missing_{i}"] = missing_data self.df.to_stata(self.fname, self.convert_dates) diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 828134b80aa3d..37418d752f833 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -14,8 +14,8 @@ def setup(self): self.str_days = [] self.str_seconds = [] for i in self.ints: - self.str_days.append("{0} days".format(i)) - self.str_seconds.append("00:00:{0:02d}".format(i)) + self.str_days.append(f"{i} days") + self.str_seconds.append(f"00:00:{i:02d}") def time_convert_int(self): to_timedelta(self.ints, unit="s") @@ -34,7 +34,7 @@ class ToTimedeltaErrors: def setup(self, errors): ints = np.random.randint(0, 60, size=10000) - self.arr = ["{0} days".format(i) for i in ints] + self.arr = [f"{i} days" for i in ints] self.arr[-1] = "apple" def time_convert(self, errors): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e6e3ee62459ca..0edc3e4a4ff3d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -288,7 +288,11 @@ def __init__( if self.name is None: self.name = index.names[level] - self.grouper, self._codes, self._group_index = index._get_grouper_for_level( # noqa: E501 + ( + self.grouper, + self._codes, + self._group_index, + ) = index._get_grouper_for_level( # noqa: E501 self.grouper, level ) diff --git a/pandas/io/common.py b/pandas/io/common.py index 
9b9fe21b56989..bd3808cf37b6b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -109,7 +109,7 @@ def _is_url(url) -> bool: def _expand_user( - filepath_or_buffer: FilePathOrBuffer[AnyStr] + filepath_or_buffer: FilePathOrBuffer[AnyStr], ) -> FilePathOrBuffer[AnyStr]: """Return the argument with an initial component of ~ or ~user replaced by that user's home directory. @@ -139,7 +139,7 @@ def _validate_header_arg(header) -> None: def _stringify_path( - filepath_or_buffer: FilePathOrBuffer[AnyStr] + filepath_or_buffer: FilePathOrBuffer[AnyStr], ) -> FilePathOrBuffer[AnyStr]: """Attempt to convert a path-like object to a string. From 43132670f2a6691bddee690ab12716a0f83d515c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Nov 2019 09:25:56 -0600 Subject: [PATCH 065/185] CI: Bump s3fs (#29573) --- ci/deps/travis-36-cov.yaml | 2 +- ci/deps/travis-36-locale.yaml | 2 +- ci/deps/travis-36-slow.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/compat/_optional.py | 2 +- pandas/tests/io/conftest.py | 3 ++- pandas/tests/io/parser/test_network.py | 12 ++++++++++-- 9 files changed, 19 insertions(+), 9 deletions(-) diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index e4e917d13990c..b2a74fceaf0fa 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -29,7 +29,7 @@ dependencies: - python-snappy - python=3.6.* - pytz - - s3fs<0.3 + - s3fs - scikit-learn - scipy - sqlalchemy diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 44795766d7c31..09f72e65098c9 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -26,7 +26,7 @@ dependencies: - python-dateutil - python=3.6.* - pytz - - s3fs=0.0.8 + - s3fs=0.3.0 - scipy - sqlalchemy=1.1.4 - xarray=0.10 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index d54708d48a65e..e9c5dadbc924a 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -18,7 +18,7 @@ dependencies: - python-dateutil - python=3.6.* - pytz - - s3fs<0.3 + - s3fs - scipy - sqlalchemy - xlrd diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 440ca6c480b87..903636f2fe060 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -17,7 +17,7 @@ dependencies: - pytest-xdist>=1.29.0 - pytest-mock - hypothesis>=3.58.0 - - s3fs<0.3 + - s3fs - pip - pyreadstat - pip: diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1ca1640f9a7c6..663948fd46cf6 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -263,7 +263,7 @@ pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing qtpy Clipboard I/O -s3fs 0.0.8 Amazon S3 access +s3fs 0.3.0 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data xclip Clipboard I/O on linux xlrd 1.1.0 Excel reading diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3b7756256dcab..896ae91c68642 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -183,6 +183,7 @@ Backwards incompatible API changes Other API changes ^^^^^^^^^^^^^^^^^ +- Bumped the minimum supported version of ``s3fs`` from 0.0.8 to 0.3.0 (:issue:`28616`) - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) - :meth:`pandas.api.types.infer_dtype` will now return "integer-na"
for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index cd4e1b7e8aa4d..14425578786d7 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -18,7 +18,7 @@ "pandas_gbq": "0.8.0", "pyarrow": "0.9.0", "pytables": "3.4.2", - "s3fs": "0.0.8", + "s3fs": "0.3.0", "scipy": "0.19.0", "sqlalchemy": "1.1.4", "tables": "3.4.2", diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 7b6b9b6380a36..3f034107ef24f 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -40,7 +40,7 @@ def s3_resource(tips_file, jsonl_file): A private bucket "cant_get_it" is also created. The boto3 s3 resource is yielded by the fixture. """ - pytest.importorskip("s3fs") + s3fs = pytest.importorskip("s3fs") boto3 = pytest.importorskip("boto3") with tm.ensure_safe_environment_variables(): @@ -77,6 +77,7 @@ def add_tips_files(bucket_name): conn.create_bucket(Bucket="cant_get_it", ACL="private") add_tips_files("cant_get_it") + s3fs.S3FileSystem.clear_instance_cache() yield conn finally: s3.stop() diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index dbe721b10a3ce..57e2950b06ce8 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -166,7 +166,7 @@ def test_s3_fails(self): # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. with pytest.raises(IOError): - read_csv("s3://cant_get_it/") + read_csv("s3://cant_get_it/file.csv") def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -184,6 +184,8 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): def test_read_csv_chunked_download(self, s3_resource, caplog): # 8 MB, S3FS uses 5MB chunks + import s3fs + df = DataFrame(np.random.randn(100000, 4), columns=list("abcd")) buf = BytesIO() str_buf = StringIO() @@ -194,7 +196,13 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf) - with caplog.at_level(logging.DEBUG, logger="s3fs.core"): + # Possibly some state leaking in between tests. + # If we don't clear this cache, we saw `GetObject operation: Forbidden`. + # Presumably the s3fs instance is being cached, with the directory listing + # from *before* we add the large-file.csv in the pandas-test bucket.
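For context on the caching being worked around here: recent s3fs releases memoize S3FileSystem instances by their constructor arguments, so constructing a "new" filesystem can silently return one whose directory listing predates the test's uploads. A minimal sketch of that behaviour, assuming s3fs's default instance caching is enabled (the anon=True argument is only an illustrative choice):

    import s3fs

    fs_a = s3fs.S3FileSystem(anon=True)
    fs_b = s3fs.S3FileSystem(anon=True)
    # Same constructor arguments -> the cached instance is reused, together
    # with any directory listings it has already fetched.
    assert fs_a is fs_b

    s3fs.S3FileSystem.clear_instance_cache()
    # After clearing, the next construction builds a fresh instance with an
    # empty listing cache, so newly uploaded keys become visible.
    fs_c = s3fs.S3FileSystem(anon=True)
    assert fs_c is not fs_a

Clearing the cache in the fixture above, and again in this test, is what lets each test observe the bucket as it exists at that moment.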
+ s3fs.S3FileSystem.clear_instance_cache() + + with caplog.at_level(logging.DEBUG, logger="s3fs"): read_csv("s3://pandas-test/large-file.csv", nrows=5) # log of fetch_range (start, stop) assert (0, 5505024) in {x.args[-2:] for x in caplog.records} From 57490b1ea6ee054224253e24bca4fc7523097972 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 16 Nov 2019 01:24:04 +0000 Subject: [PATCH 066/185] CI: fix flake C413, C414, C416 errors (#29646) --- .pre-commit-config.yaml | 2 +- asv_bench/benchmarks/categoricals.py | 2 +- asv_bench/benchmarks/frame_ctor.py | 2 +- doc/source/whatsnew/v0.24.0.rst | 2 +- environment.yml | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/sparse/scipy_sparse.py | 2 +- pandas/core/frame.py | 4 +- pandas/core/indexes/multi.py | 6 +-- pandas/core/internals/managers.py | 2 +- pandas/core/tools/datetimes.py | 4 +- pandas/io/parsers.py | 4 +- pandas/io/pytables.py | 2 +- pandas/tests/api/test_api.py | 2 +- pandas/tests/frame/test_alter_axes.py | 5 +-- pandas/tests/frame/test_analytics.py | 2 +- pandas/tests/frame/test_reshape.py | 2 +- pandas/tests/groupby/test_function.py | 4 +- .../tests/indexes/timedeltas/test_setops.py | 6 +-- pandas/tests/indexing/multiindex/test_loc.py | 10 +---- pandas/tests/indexing/multiindex/test_xs.py | 8 +--- pandas/tests/indexing/test_categorical.py | 2 +- pandas/tests/io/formats/test_to_excel.py | 2 +- pandas/tests/io/pytables/test_store.py | 41 +++++++++---------- pandas/tests/io/sas/test_xport.py | 2 +- pandas/tests/io/test_parquet.py | 2 +- pandas/tests/plotting/test_misc.py | 5 +-- pandas/tests/reshape/merge/test_join.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/series/test_alter_axes.py | 2 +- pandas/tests/series/test_api.py | 4 +- pandas/tests/series/test_datetime_values.py | 12 ++---- pandas/tests/series/test_duplicates.py | 2 +- pandas/tests/series/test_timeseries.py | 2 +- pandas/tests/test_base.py | 2 +- pandas/tests/test_multilevel.py | 2 +- pandas/tests/tseries/offsets/test_offsets.py | 6 +-- pandas/util/testing.py | 2 +- requirements-dev.txt | 2 +- 39 files changed, 75 insertions(+), 94 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3f98273a336cf..3bed68fd8d2fc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: hooks: - id: flake8 language: python_venv - additional_dependencies: [flake8-comprehensions] + additional_dependencies: [flake8-comprehensions>=3.1.0] - repo: https://github.com/pre-commit/mirrors-isort rev: v4.3.20 hooks: diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index e21d859d18c8c..a299e688a13ed 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -164,7 +164,7 @@ def setup(self, dtype): np.random.seed(1234) n = 5 * 10 ** 5 sample_size = 100 - arr = [i for i in np.random.randint(0, n // 10, size=n)] + arr = list(np.random.randint(0, n // 10, size=n)) if dtype == "object": arr = [f"s{i:04d}" for i in arr] self.sample = np.random.choice(arr, sample_size) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 3944e0bc523d8..a949ffdced576 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -99,7 +99,7 @@ class FromLists: def setup(self): N = 1000 M = 100 - self.data = [[j for j in range(M)] for i in range(N)] + self.data = [list(range(M)) for i in range(N)] def time_frame_from_lists(self): self.df = DataFrame(self.data) diff --git 
a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 42579becd4237..85de0150a5a28 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -353,7 +353,7 @@ Example: mi = pd.MultiIndex.from_product([list('AB'), list('CD'), list('EF')], names=['AB', 'CD', 'EF']) - df = pd.DataFrame([i for i in range(len(mi))], index=mi, columns=['N']) + df = pd.DataFrame(list(range(len(mi))), index=mi, columns=['N']) df df.rename_axis(index={'CD': 'New'}) diff --git a/environment.yml b/environment.yml index e9ac76f5bc52c..a3582c56ee9d2 100644 --- a/environment.yml +++ b/environment.yml @@ -18,7 +18,7 @@ dependencies: - black<=19.3b0 - cpplint - flake8 - - flake8-comprehensions # used by flake8, linting of unnecessary comprehensions + - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - isort # check that imports are in the right order - mypy=0.720 diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 817972b3356a2..53689b6bc2eba 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1110,7 +1110,7 @@ def remove_categories(self, removals, inplace=False): if not is_list_like(removals): removals = [removals] - removal_set = set(list(removals)) + removal_set = set(removals) not_included = removal_set - set(self.dtype.categories) new_categories = [c for c in self.dtype.categories if c not in removal_set] diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 11c27451a5801..6ae2903d9826c 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -51,7 +51,7 @@ def _get_label_to_i_dict(labels, sort_labels=False): """ labels = Index(map(tuple, labels)).unique().tolist() # squish if sort_labels: - labels = sorted(list(labels)) + labels = sorted(labels) d = OrderedDict((k, i) for i, k in enumerate(labels)) return d diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca421e9695888..9c9189e5f8316 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4281,7 +4281,7 @@ def set_index( arrays = [] names = [] if append: - names = [x for x in self.index.names] + names = list(self.index.names) if isinstance(self.index, ABCMultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) @@ -7268,7 +7268,7 @@ def _series_round(s, decimals): if isinstance(decimals, Series): if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") - new_cols = [col for col in _dict_round(self, decimals)] + new_cols = list(_dict_round(self, decimals)) elif is_integer(decimals): # Dispatch to Series.round new_cols = [_series_round(v, decimals) for _, v in self.items()] diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d8f9db06c5e8c..7b02a99263266 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -368,7 +368,7 @@ def _verify_integrity(self, codes=None, levels=None): if not level.is_unique: raise ValueError( "Level values must be unique: {values} on " - "level {level}".format(values=[value for value in level], level=i) + "level {level}".format(values=list(level), level=i) ) if self.sortorder is not None: if self.sortorder > self._lexsort_depth(): @@ -1992,8 +1992,8 @@ def levshape(self): def __reduce__(self): """Necessary for making this object picklable""" d = dict( - levels=[lev for lev in 
self.levels], - codes=[level_codes for level_codes in self.codes], + levels=list(self.levels), + codes=list(self.codes), sortorder=self.sortorder, names=list(self.names), ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 96fd4c6bdc6e5..d92167f8a3b19 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -260,7 +260,7 @@ def get_ftypes(self): def __getstate__(self): block_values = [b.values for b in self.blocks] block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] - axes_array = [ax for ax in self.axes] + axes_array = list(self.axes) extra_state = { "0.14.1": { diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index bb8d15896b727..453d1cca2e085 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -857,7 +857,7 @@ def f(value): # we require at least Ymd required = ["year", "month", "day"] - req = sorted(list(set(required) - set(unit_rev.keys()))) + req = sorted(set(required) - set(unit_rev.keys())) if len(req): raise ValueError( "to assemble mappings requires at least that " @@ -866,7 +866,7 @@ def f(value): ) # keys we don't recognize - excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values()))) + excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) if len(excess): raise ValueError( "extra keys have been passed " diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dba96358227c3..d9e505f0b30cd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1605,7 +1605,7 @@ def ix(col): # remove index items from content and columns, don't pop in # loop - for i in reversed(sorted(to_remove)): + for i in sorted(to_remove, reverse=True): data.pop(i) if not self._implicit_index: columns.pop(i) @@ -1637,7 +1637,7 @@ def _get_name(icol): # remove index items from content and columns, don't pop in # loop - for c in reversed(sorted(to_remove)): + for c in sorted(to_remove, reverse=True): data.pop(c) col_names.remove(c) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 03eb8570e436e..7c7b78720d46d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1341,7 +1341,7 @@ def info(self) -> str: type=type(self), path=pprint_thing(self._path) ) if self.is_open: - lkeys = sorted(list(self.keys())) + lkeys = sorted(self.keys()) if len(lkeys): keys = [] values = [] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 3a8e263ac2a6d..601fde80e9a94 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -13,7 +13,7 @@ def check(self, namespace, expected, ignored=None): result = sorted(f for f in dir(namespace) if not f.startswith("__")) if ignored is not None: - result = sorted(list(set(result) - set(ignored))) + result = sorted(set(result) - set(ignored)) expected = sorted(expected) tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 9b76be18b0e88..21470151dcfbd 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -381,7 +381,7 @@ def test_set_index_custom_label_hashable_iterable(self): class Thing(frozenset): # need to stabilize repr for KeyError (due to random order in sets) def __repr__(self) -> str: - tmp = sorted(list(self)) + tmp = sorted(self) # double curly brace prints one brace in format string return "frozenset({{{}}})".format(", ".join(map(repr, tmp))) @@ -745,8 +745,7 @@ def test_rename_axis_mapper(self): # GH 
19978 mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) df = DataFrame( - {"x": [i for i in range(len(mi))], "y": [i * 10 for i in range(len(mi))]}, - index=mi, + {"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi ) # Test for rename of the Index object of columns diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index f694689fa9dfb..9cc9c5dc697b6 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1244,7 +1244,7 @@ def test_mode_dropna(self, dropna, expected): } ) - result = df[sorted(list(expected.keys()))].mode(dropna=dropna) + result = df[sorted(expected.keys())].mode(dropna=dropna) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 5d2c115ce8eb5..5acd681933914 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -699,7 +699,7 @@ def verify(df): for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) - right = sorted(list(map(cast, right))) + right = sorted(map(cast, right)) assert left == right df = DataFrame( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 18c4d7ceddc65..c41f762e9128d 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1300,8 +1300,8 @@ def test_size_groupby_all_null(): ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), # Timestamps ( - [x for x in pd.date_range("1/1/18", freq="D", periods=5)], - [x for x in pd.date_range("1/1/18", freq="D", periods=5)][::-1], + list(pd.date_range("1/1/18", freq="D", periods=5)), + list(pd.date_range("1/1/18", freq="D", periods=5))[::-1], ), # All NA ([np.nan] * 5, [np.nan] * 5), diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 861067480b5fa..bbdd6c8c7c017 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -39,7 +39,7 @@ def test_union_bug_1730(self): rng_b = timedelta_range("1 day", periods=4, freq="4H") result = rng_a.union(rng_b) - exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) + exp = TimedeltaIndex(sorted(set(rng_a) | set(rng_b))) tm.assert_index_equal(result, exp) def test_union_bug_1745(self): @@ -50,7 +50,7 @@ def test_union_bug_1745(self): ) result = left.union(right) - exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + exp = TimedeltaIndex(sorted(set(left) | set(right))) tm.assert_index_equal(result, exp) def test_union_bug_4564(self): @@ -59,7 +59,7 @@ def test_union_bug_4564(self): right = left + pd.offsets.Minute(15) result = left.union(right) - exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + exp = TimedeltaIndex(sorted(set(left) | set(right))) tm.assert_index_equal(result, exp) def test_intersection_bug_1708(self): diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 9eeee897bfbb5..76425c72ce4f9 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -1,5 +1,3 @@ -import itertools - import numpy as np import pytest @@ -223,17 +221,13 @@ def test_loc_getitem_int_slice(self): # GH 3053 # loc should treat integer slices like label slices - index = 
MultiIndex.from_tuples( - [t for t in itertools.product([6, 7, 8], ["a", "b"])] - ) + index = MultiIndex.from_product([[6, 7, 8], ["a", "b"]]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df tm.assert_frame_equal(result, expected) - index = MultiIndex.from_tuples( - [t for t in itertools.product([10, 20, 30], ["a", "b"])] - ) + index = MultiIndex.from_product([[10, 20, 30], ["a", "b"]]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.iloc[2:] diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index 99f343c2f4a7d..c81712b1e0496 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -1,5 +1,3 @@ -from itertools import product - import numpy as np import pytest @@ -159,10 +157,8 @@ def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): def test_xs_integer_key(): # see gh-2107 dates = range(20111201, 20111205) - ids = "abcde" - index = MultiIndex.from_tuples( - [x for x in product(dates, ids)], names=["date", "secid"] - ) + ids = list("abcde") + index = MultiIndex.from_product([dates, ids], names=["date", "secid"]) df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"]) result = df.xs(20111201, level="date") diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 6c81a00cb8f34..ab3b0ed13b5c0 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -472,7 +472,7 @@ def test_getitem_with_listlike(self): [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats ) dummies = pd.get_dummies(cats) - result = dummies[[c for c in dummies.columns]] + result = dummies[list(dummies.columns)] tm.assert_frame_equal(result, expected) def test_setitem_listlike(self): diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 1440b0a6f06f1..4d8edec7c7f14 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -262,7 +262,7 @@ def test_css_to_excel_inherited(css, inherited, expected): @pytest.mark.parametrize( "input_color,output_color", ( - [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] + list(CSSToExcelConverter.NAMED_COLORS.items()) + [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + [("#F0F", "FF00FF"), ("#ABC", "AABBCC")] ), diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index a43da75005a65..d79280f9ea494 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -2806,16 +2806,16 @@ def test_select_iterator(self, setup_path): expected = store.select("df") - results = [s for s in store.select("df", iterator=True)] + results = list(store.select("df", iterator=True)) result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select("df", chunksize=100)] + results = list(store.select("df", chunksize=100)) assert len(results) == 5 result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select("df", chunksize=150)] + results = list(store.select("df", chunksize=150)) result = concat(results) tm.assert_frame_equal(result, expected) @@ -2835,7 +2835,7 @@ def test_select_iterator(self, setup_path): df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df", format="table") - results = [s for s in 
read_hdf(path, "df", chunksize=100)] + results = list(read_hdf(path, "df", chunksize=100)) result = concat(results) assert len(results) == 5 @@ -2856,12 +2856,9 @@ def test_select_iterator(self, setup_path): # full selection expected = store.select_as_multiple(["df1", "df2"], selector="df1") - results = [ - s - for s in store.select_as_multiple( - ["df1", "df2"], selector="df1", chunksize=150 - ) - ] + results = list( + store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) + ) result = concat(results) tm.assert_frame_equal(expected, result) @@ -2916,19 +2913,19 @@ def test_select_iterator_complete_8014(self, setup_path): end_dt = expected.index[-1] # select w/iterator and no where clause works - results = [s for s in store.select("df", chunksize=chunksize)] + results = list(store.select("df", chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) @@ -2936,7 +2933,7 @@ def test_select_iterator_complete_8014(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) @@ -2958,14 +2955,14 @@ def test_select_iterator_non_complete_8014(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) @@ -2974,7 +2971,7 @@ def test_select_iterator_non_complete_8014(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[ (expected.index >= beg_dt) & (expected.index <= end_dt) @@ -2992,7 +2989,7 @@ def test_select_iterator_non_complete_8014(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index > '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, 
chunksize=chunksize)) assert 0 == len(results) def test_select_iterator_many_empty_frames(self, setup_path): @@ -3014,14 +3011,14 @@ def test_select_iterator_many_empty_frames(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) assert len(results) == 1 result = concat(results) @@ -3032,7 +3029,7 @@ def test_select_iterator_many_empty_frames(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) # should be 1, is 10 assert len(results) == 1 @@ -3052,7 +3049,7 @@ def test_select_iterator_many_empty_frames(self, setup_path): where = "index <= '{beg_dt}' & index >= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) # should be [] assert len(results) == 0 diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 7893877be2033..a52b22122ba81 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -104,7 +104,7 @@ def test1_incremental(self): reader = read_sas(self.file01, index="SEQN", chunksize=1000) - all_data = [x for x in reader] + all_data = list(reader) data = pd.concat(all_data, axis=0) tm.assert_frame_equal(data, data_csv, check_index_type=False) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index debc797fe6e88..5dd671c659263 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -405,7 +405,7 @@ def test_write_ignoring_index(self, engine): ["one", "two", "one", "two", "one", "two", "one", "two"], ] df = pd.DataFrame( - {"one": [i for i in range(8)], "two": [-i for i in range(8)]}, index=arrays + {"one": list(range(8)), "two": [-i for i in range(8)]}, index=arrays ) expected = df.reset_index(drop=True) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 940cfef4058e0..c51cd0e92eb3c 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -266,7 +266,7 @@ def test_parallel_coordinates_with_sorted_labels(self): df = DataFrame( { - "feat": [i for i in range(30)], + "feat": list(range(30)), "class": [2 for _ in range(10)] + [3 for _ in range(10)] + [1 for _ in range(10)], @@ -279,8 +279,7 @@ def test_parallel_coordinates_with_sorted_labels(self): ) ordered_color_label_tuples = sorted(color_label_tuples, key=lambda x: x[1]) prev_next_tupels = zip( - [i for i in ordered_color_label_tuples[0:-1]], - [i for i in ordered_color_label_tuples[1:]], + list(ordered_color_label_tuples[0:-1]), list(ordered_color_label_tuples[1:]) ) for prev, nxt in prev_next_tupels: # labels and colors are ordered strictly increasing diff --git 
a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 925eaac45045d..e477b7608ab93 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -624,7 +624,7 @@ def test_join_mixed_non_unique_index(self): def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range("2016-01-01", periods=16, freq="M") - df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) + df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) df2 = concat([df, df]) result = df.join(df2, how="inner", rsuffix="_df2") expected = DataFrame( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index dd51a1a6c8359..5f4e8323c7127 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -860,7 +860,7 @@ def test_merge_datetime64tz_with_dst_transition(self): def test_merge_non_unique_period_index(self): # GH #16871 index = pd.period_range("2016-01-01", periods=16, freq="M") - df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) + df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) df2 = concat([df, df]) result = df.merge(df2, left_index=True, right_index=True, how="inner") expected = DataFrame( diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 5d74ad95be90d..7a24a45b4b6c2 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -233,7 +233,7 @@ def test_reorder_levels(self): def test_rename_axis_mapper(self): # GH 19978 mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) - s = Series([i for i in range(len(mi))], index=mi) + s = Series(list(range(len(mi))), index=mi) result = s.rename_axis(index={"ll": "foo"}) assert result.index.names == ["foo", "nn"] diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 656bf5a0e8a44..00c66c8a13bd9 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -261,11 +261,11 @@ def test_tab_completion_with_categorical(self): def get_dir(s): results = [r for r in s.cat.__dir__() if not r.startswith("_")] - return list(sorted(set(results))) + return sorted(set(results)) s = Series(list("aabbcde")).astype("category") results = get_dir(s) - tm.assert_almost_equal(results, list(sorted(set(ok_for_cat)))) + tm.assert_almost_equal(results, sorted(set(ok_for_cat))) @pytest.mark.parametrize( "index", diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 0be3c729cff91..d038df1747f73 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -208,20 +208,18 @@ def compare(s, name): # test limited display api def get_dir(s): results = [r for r in s.dt.__dir__() if not r.startswith("_")] - return list(sorted(set(results))) + return sorted(set(results)) s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") results = get_dir(s) - tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) - ) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) s = Series( period_range("20130101", periods=5, freq="D", name="xxx").astype(object) ) results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_period + ok_for_period_methods))) + results, sorted(set(ok_for_period + ok_for_period_methods)) ) # 11295 
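The hunks in this commit are mechanical; for reference, the three flake8-comprehensions codes named in the commit title correspond to rewrites like the following sketch (rule numbers per flake8-comprehensions >= 3.1.0, the version pinned earlier in this patch):

    data = [3, 1, 2]

    # C413: redundant list() around sorted() -- sorted() already returns a new list.
    before_c413 = list(sorted(data))
    after_c413 = sorted(data)

    # C414: redundant list() inside set()/sorted()/tuple() -- the outer call
    # consumes any iterable directly.
    before_c414 = set(list(data))
    after_c414 = set(data)

    # C416: a comprehension that only copies its input -- use list() or set().
    before_c416 = [x for x in data]
    after_c416 = list(data)

    assert before_c413 == after_c413 == [1, 2, 3]
    assert before_c414 == after_c414 == {1, 2, 3}
    assert before_c416 == after_c416 == [3, 1, 2]

Each rewrite is behaviour-preserving, which is why these hunks change style but not semantics.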
@@ -229,9 +227,7 @@ def get_dir(s): s = Series(pd.date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") results = get_dir(s) - tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) - ) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) exp_values = pd.date_range( "2015-01-01", "2016-01-01", freq="T", tz="UTC" ).tz_convert("America/Chicago") diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 4a914e4fb0f2c..0f7e3e307ed19 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -85,7 +85,7 @@ def __ne__(self, other): with capsys.disabled(): li = [Foo(i) for i in range(5)] - s = Series(li, index=[i for i in range(5)]) + s = Series(li, index=list(range(5))) s.is_unique captured = capsys.readouterr() assert len(captured.err) == 0 diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 4ae00bca3e832..cf06a9a7c8415 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -604,7 +604,7 @@ def test_asfreq_keep_index_name(self): # GH #9854 index_name = "bar" index = pd.date_range("20130101", periods=20, name=index_name) - df = pd.DataFrame([x for x in range(20)], columns=["foo"], index=index) + df = pd.DataFrame(list(range(20)), columns=["foo"], index=index) assert index_name == df.index.name assert index_name == df.asfreq("10D").index.name diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 21fed62e51fdf..d9bdceb258592 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -653,7 +653,7 @@ def test_value_counts_datetime64(self, klass): # with NaT s = df["dt"].copy() - s = klass([v for v in s.values] + [pd.NaT]) + s = klass(list(s.values) + [pd.NaT]) result = s.value_counts() assert result.index.dtype == "datetime64[ns]" diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index cd6acafc394c5..4d0f0b57c65af 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -257,7 +257,7 @@ def test_repr_name_coincide(self): assert lines[2].startswith("a 0 foo") def test_delevel_infer_dtype(self): - tuples = [tuple for tuple in product(["foo", "bar"], [10, 20], [1.0, 1.1])] + tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1])) index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index) deleveled = df.reset_index() diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index bed8d2461f65d..e443a7cc932be 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -4253,7 +4253,7 @@ def test_valid_default_arguments(offset_types): cls() -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_month_attributes(kwd, month_classes): # GH#18226 cls = month_classes @@ -4262,14 +4262,14 @@ def test_valid_month_attributes(kwd, month_classes): cls(**{kwd: 3}) -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_relativedelta_kwargs(kwd): # Check that all the arguments specified in liboffsets.relativedelta_kwds # are in 
fact valid relativedelta keyword args DateOffset(**{kwd: 1}) -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_tick_attributes(kwd, tick_classes): # GH#18226 cls = tick_classes diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 2b4cb322fc966..bcd12eba1651a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1952,7 +1952,7 @@ def keyfunc(x): label = "{prefix}_l{i}_g{j}".format(prefix=prefix, i=i, j=j) cnt[label] = ndupe_l[i] # cute Counter trick - result = list(sorted(cnt.elements(), key=keyfunc))[:nentries] + result = sorted(cnt.elements(), key=keyfunc)[:nentries] tuples.append(result) tuples = list(zip(*tuples)) diff --git a/requirements-dev.txt b/requirements-dev.txt index 13e2c95126f0c..6235b61d92f29 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,7 +6,7 @@ cython>=0.29.13 black<=19.3b0 cpplint flake8 -flake8-comprehensions +flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 isort mypy==0.720 From cd013b41bbde9752bbf561eba4abfbc439e83c7a Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 16 Nov 2019 16:47:13 +0000 Subject: [PATCH 067/185] CI: Fix error when creating postgresql db (#29655) --- ci/setup_env.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 4d454f9c5041a..0e8d6fb7cd35a 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -114,6 +114,11 @@ echo "w/o removing anything else" conda remove pandas -y --force || true pip uninstall -y pandas || true +echo +echo "remove postgres if has been installed with conda" +echo "we use the one from the CI" +conda remove postgresql -y --force || true + echo echo "conda list pandas" conda list pandas From 09e16d7a66247951c8596ff519f67b606075816c Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sat, 16 Nov 2019 22:30:04 +0200 Subject: [PATCH 068/185] F-strings (#29662) --- pandas/compat/__init__.py | 2 +- pandas/compat/_optional.py | 22 ++++++++---------- pandas/compat/numpy/__init__.py | 10 ++++----- pandas/compat/numpy/function.py | 40 ++++++++++++--------------------- 4 files changed, 29 insertions(+), 45 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 684fbbc23c86c..f95dd8679308f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -30,7 +30,7 @@ def set_function_name(f, name, cls): Bind the name/qualname attributes of the function. """ f.__name__ = name - f.__qualname__ = "{klass}.{name}".format(klass=cls.__name__, name=name) + f.__qualname__ = f"{cls.__name__}.{name}" f.__module__ = cls.__module__ return f diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 14425578786d7..fc66502710b0c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -28,15 +28,6 @@ "xlsxwriter": "0.9.8", } -message = ( - "Missing optional dependency '{name}'. {extra} " - "Use pip or conda to install {name}." -) -version_message = ( - "Pandas requires version '{minimum_version}' or newer of '{name}' " - "(version '{actual_version}' currently installed)." 
-) - def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) @@ -45,7 +36,7 @@ def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__VERSION__", None) if version is None: - raise ImportError("Can't determine version for {}".format(module.__name__)) + raise ImportError(f"Can't determine version for {module.__name__}") return version @@ -86,11 +77,15 @@ def import_optional_dependency( is False, or when the package's version is too old and `on_version` is ``'warn'``. """ + msg = ( + f"Missing optional dependency '{name}'. {extra} " + f"Use pip or conda to install {name}." + ) try: module = importlib.import_module(name) except ImportError: if raise_on_missing: - raise ImportError(message.format(name=name, extra=extra)) from None + raise ImportError(msg) from None else: return None @@ -99,8 +94,9 @@ def import_optional_dependency( version = _get_version(module) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} - msg = version_message.format( - minimum_version=minimum_version, name=name, actual_version=version + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{name}' " + f"(version '{version}' currently installed)." ) if on_version == "warn": warnings.warn(msg, UserWarning) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 402ed62f2df65..27f1c32058941 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -18,11 +18,11 @@ if _nlv < "1.13.3": raise ImportError( - "this version of pandas is incompatible with " - "numpy < 1.13.3\n" - "your numpy version is {0}.\n" - "Please upgrade numpy to >= 1.13.3 to use " - "this pandas version".format(_np_version) + f"this version of pandas is incompatible with " + f"numpy < 1.13.3\n" + f"your numpy version is {_np_version}.\n" + f"Please upgrade numpy to >= 1.13.3 to use " + f"this pandas version" ) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index c2fe7d1dd12f4..ea5aaf6b6476d 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -58,9 +58,7 @@ def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=No fname, args, kwargs, max_fname_arg_count, self.defaults ) else: - raise ValueError( - "invalid validation method '{method}'".format(method=method) - ) + raise ValueError(f"invalid validation method '{method}'") ARGMINMAX_DEFAULTS = dict(out=None) @@ -312,9 +310,8 @@ def validate_take_with_convert(convert, args, kwargs): def validate_window_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. " - "Use .{func}() directly instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .{name}() directly instead " ) if len(args) > 0: @@ -328,9 +325,8 @@ def validate_window_func(name, args, kwargs): def validate_rolling_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. " - "Use .rolling(...).{func}() instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .rolling(...).{name}() instead " ) if len(args) > 0: @@ -344,9 +340,8 @@ def validate_rolling_func(name, args, kwargs): def validate_expanding_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. 
" - "Use .expanding(...).{func}() instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .expanding(...).{name}() instead " ) if len(args) > 0: @@ -371,11 +366,9 @@ def validate_groupby_func(name, args, kwargs, allowed=None): if len(args) + len(kwargs) > 0: raise UnsupportedFunctionCall( - ( - "numpy operations are not valid " - "with groupby. Use .groupby(...)." - "{func}() instead".format(func=name) - ) + f"numpy operations are not valid with " + f"groupby. Use .groupby(...).{name}() " + f"instead" ) @@ -391,11 +384,9 @@ def validate_resampler_func(method, args, kwargs): if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: raise UnsupportedFunctionCall( - ( - "numpy operations are not valid " - "with resample. Use .resample(...)." - "{func}() instead".format(func=method) - ) + f"numpy operations are not " + f"valid with resample. Use " + f".resample(...).{method}() instead" ) else: raise TypeError("too many arguments passed in") @@ -418,7 +409,4 @@ def validate_minmax_axis(axis): if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): - raise ValueError( - "`axis` must be fewer than the number of " - "dimensions ({ndim})".format(ndim=ndim) - ) + raise ValueError(f"`axis` must be fewer than the number of dimensions ({ndim})") From 5ef1b321b418542cc2b7586c179bb341d18a0cf0 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sat, 16 Nov 2019 22:35:19 +0200 Subject: [PATCH 069/185] F-string (#29663) --- asv_bench/benchmarks/io/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index adb3dd95e3574..b8e8630e663ee 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -132,7 +132,7 @@ class ReadCSVConcatDatetimeBadDateValue(StringIORewind): param_names = ["bad_date_value"] def setup(self, bad_date_value): - self.StringIO_input = StringIO(("%s,\n" % bad_date_value) * 50000) + self.StringIO_input = StringIO((f"{bad_date_value},\n") * 50000) def time_read_csv(self, bad_date_value): read_csv( From 67fc8a144837611035f258208ad77fb4e5309fe5 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 16 Nov 2019 20:39:05 +0000 Subject: [PATCH 070/185] CI: Forcing GitHub actions to activate (#29661) --- .github/workflows/activate.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/activate.yml diff --git a/.github/workflows/activate.yml b/.github/workflows/activate.yml new file mode 100644 index 0000000000000..f6aede6289ebf --- /dev/null +++ b/.github/workflows/activate.yml @@ -0,0 +1,21 @@ +# Simple first task to activate GitHub actions. +# This won't run until is merged, but future actions will +# run on PRs, so we can see we don't break things in more +# complex actions added later, like real builds. 
+# +# TODO: Remove this once another action exists +name: Activate + +on: + push: + branches: master + pull_request: + branches: master + +jobs: + activate: + name: Activate actions + runs-on: ubuntu-latest + steps: + - name: Activate + run: echo "GitHub actions ok" From e3e5b0d992f38d79029c254f412cf162b7444c71 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 Nov 2019 12:49:41 -0800 Subject: [PATCH 071/185] TYP: annotations in core.indexes (#29656) --- pandas/core/indexes/api.py | 29 ++++++++++--------- pandas/core/indexes/base.py | 40 +++++++++++++-------------- pandas/core/indexes/category.py | 6 ++-- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 4 +-- pandas/core/indexes/interval.py | 16 +++++------ pandas/core/indexes/multi.py | 24 ++++++++-------- pandas/core/indexes/numeric.py | 12 ++++---- pandas/core/indexes/period.py | 10 ++++--- pandas/core/indexes/range.py | 12 ++++---- pandas/core/indexes/timedeltas.py | 6 ++-- pandas/core/internals/construction.py | 4 ++- pandas/core/reshape/concat.py | 10 ++----- pandas/core/reshape/pivot.py | 5 +++- pandas/tseries/frequencies.py | 2 +- 15 files changed, 94 insertions(+), 88 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index a7cf2c20b0dec..f650a62bc5b74 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,4 +1,5 @@ import textwrap +from typing import List, Set import warnings from pandas._libs import NaT, lib @@ -64,7 +65,9 @@ ] -def get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): +def get_objs_combined_axis( + objs, intersect: bool = False, axis=0, sort: bool = True +) -> Index: """ Extract combined index: return intersection or union (depending on the value of "intersect") of indexes on given axis, or None if all objects @@ -72,9 +75,8 @@ def get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): Parameters ---------- - objs : list of objects - Each object will only be considered if it has a _get_axis - attribute. + objs : list + Series or DataFrame objects, may be mix of the two. intersect : bool, default False If True, calculate the intersection between indexes. Otherwise, calculate the union. @@ -87,26 +89,27 @@ def get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): ------- Index """ - obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, "_get_axis")] - if obs_idxes: - return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) + obs_idxes = [obj._get_axis(axis) for obj in objs] + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_distinct_objs(objs): +def _get_distinct_objs(objs: List[Index]) -> List[Index]: """ Return a list with distinct elements of "objs" (different ids). Preserves order. """ - ids = set() + ids: Set[int] = set() res = [] for obj in objs: - if not id(obj) in ids: + if id(obj) not in ids: ids.add(id(obj)) res.append(obj) return res -def _get_combined_index(indexes, intersect=False, sort=False): +def _get_combined_index( + indexes: List[Index], intersect: bool = False, sort: bool = False +) -> Index: """ Return the union or intersection of indexes. @@ -147,7 +150,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): return index -def union_indexes(indexes, sort=True): +def union_indexes(indexes, sort=True) -> Index: """ Return the union of indexes. 
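
The identity-based de-duplication in ``_get_distinct_objs`` can be sketched
standalone; a minimal illustration (the ``pd.Index`` values are made up for
the example):

    >>> import pandas as pd
    >>> idx = pd.Index([1, 2, 3])
    >>> objs = [idx, idx, pd.Index([1, 2, 3])]  # same object twice, plus an equal copy
    >>> ids, distinct = set(), []
    >>> for obj in objs:
    ...     if id(obj) not in ids:  # keyed on identity, not equality
    ...         ids.add(id(obj))
    ...         distinct.append(obj)
    >>> len(distinct)  # idx survives once; the equal-but-distinct copy also survives
    2
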
@@ -173,7 +176,7 @@ def union_indexes(indexes, sort=True): indexes, kind = _sanitize_and_check(indexes) - def _unique_indices(inds): + def _unique_indices(inds) -> Index: """ Convert indexes to lists and concatenate them, removing duplicates. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5b57d3f096b0c..699994964ab40 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1650,7 +1650,7 @@ def _get_grouper_for_level(self, mapper, level=None): # Introspection Methods @property - def is_monotonic(self): + def is_monotonic(self) -> bool: """ Alias for is_monotonic_increasing. """ @@ -1691,7 +1691,7 @@ def is_monotonic_decreasing(self) -> bool: return self._engine.is_monotonic_decreasing @property - def _is_strictly_monotonic_increasing(self): + def _is_strictly_monotonic_increasing(self) -> bool: """ Return if the index is strictly monotonic increasing (only increasing) values. @@ -1708,7 +1708,7 @@ def _is_strictly_monotonic_increasing(self): return self.is_unique and self.is_monotonic_increasing @property - def _is_strictly_monotonic_decreasing(self): + def _is_strictly_monotonic_decreasing(self) -> bool: """ Return if the index is strictly monotonic decreasing (only decreasing) values. @@ -1725,7 +1725,7 @@ def _is_strictly_monotonic_decreasing(self): return self.is_unique and self.is_monotonic_decreasing @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: """ Return if the index has unique values. """ @@ -1735,22 +1735,22 @@ def is_unique(self): def has_duplicates(self) -> bool: return not self.is_unique - def is_boolean(self): + def is_boolean(self) -> bool: return self.inferred_type in ["boolean"] - def is_integer(self): + def is_integer(self) -> bool: return self.inferred_type in ["integer"] - def is_floating(self): + def is_floating(self) -> bool: return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] - def is_numeric(self): + def is_numeric(self) -> bool: return self.inferred_type in ["integer", "floating"] - def is_object(self): + def is_object(self) -> bool: return is_object_dtype(self.dtype) - def is_categorical(self): + def is_categorical(self) -> bool: """ Check if the Index holds categorical data. @@ -1786,10 +1786,10 @@ def is_categorical(self): """ return self.inferred_type in ["categorical"] - def is_interval(self): + def is_interval(self) -> bool: return self.inferred_type in ["interval"] - def is_mixed(self): + def is_mixed(self) -> bool: return self.inferred_type in ["mixed"] def holds_integer(self): @@ -1868,7 +1868,7 @@ def _isnan(self): @cache_readonly def _nan_idxs(self): if self._can_hold_na: - w, = self._isnan.nonzero() + w = self._isnan.nonzero()[0] return w else: return np.array([], dtype=np.int64) @@ -4086,13 +4086,13 @@ def _assert_can_do_op(self, value): msg = "'value' must be a scalar, passed: {0}" raise TypeError(msg.format(type(value).__name__)) - def _is_memory_usage_qualified(self): + def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ return self.is_object() - def is_type_compatible(self, kind): + def is_type_compatible(self, kind) -> bool: """ Whether the index type is compatible with the provided type. 
""" @@ -4131,14 +4131,14 @@ def is_type_compatible(self, kind): """ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: hash(key) try: return key in self._engine except (OverflowError, TypeError, ValueError): return False - def contains(self, key): + def contains(self, key) -> bool: """ Return a boolean indicating whether the provided key is in the index. @@ -4199,7 +4199,7 @@ def __getitem__(self, key): else: return result - def _can_hold_identifiers_and_holds_name(self, name): + def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ Faster check for ``name in self`` when we know `name` is a Python identifier (e.g. in NDFrame.__getattr__, which hits this to support @@ -4290,7 +4290,7 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def equals(self, other): + def equals(self, other) -> bool: """ Determine if two Index objects contain the same elements. @@ -4314,7 +4314,7 @@ def equals(self, other): com.values_from_object(self), com.values_from_object(other) ) - def identical(self, other): + def identical(self, other) -> bool: """ Similar to equals, but check that other comparable attributes are also equal. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 49bb705e09469..819f8ac53197a 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -276,7 +276,7 @@ def _shallow_copy(self, values=None, dtype=None, **kwargs): dtype = self.dtype return super()._shallow_copy(values=values, dtype=dtype, **kwargs) - def _is_dtype_compat(self, other): + def _is_dtype_compat(self, other) -> bool: """ *this is an internal non-public method* @@ -407,7 +407,7 @@ def _reverse_indexer(self): return self._data._reverse_indexer() @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: # if key is a NaN, check if any NaN is in self. 
if is_scalar(key) and isna(key): return self.hasnans @@ -455,7 +455,7 @@ def _engine(self): # introspection @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: return self._engine.is_unique @property diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index f694b85f1ca5d..ceb23f61ae15a 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -148,7 +148,7 @@ def wrapper(self, other): return wrapper @property - def _ndarray_values(self): + def _ndarray_values(self) -> np.ndarray: return self._data._ndarray_values # ------------------------------------------------------------------------ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index aee9be20a1593..41f5eb90d51b0 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -410,7 +410,7 @@ def tz(self, value): tzinfo = tz @cache_readonly - def _is_dates_only(self): + def _is_dates_only(self) -> bool: """Return a boolean if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only @@ -1237,7 +1237,7 @@ def searchsorted(self, value, side="left", sorter=None): return self.values.searchsorted(value, side=side) - def is_type_compatible(self, typ): + def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "datetime" @property diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 4a75ab58b7a65..35e8405e0f1aa 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -343,7 +343,7 @@ def _engine(self): right = self._maybe_convert_i8(self.right) return IntervalTree(left, right, closed=self.closed) - def __contains__(self, key): + def __contains__(self, key) -> bool: """ return a boolean if this key is IN the index We *only* accept an Interval @@ -483,7 +483,7 @@ def _values(self): return self._data @cache_readonly - def _ndarray_values(self): + def _ndarray_values(self) -> np.ndarray: return np.array(self._data) def __array__(self, result=None): @@ -529,7 +529,7 @@ def inferred_type(self) -> str: return "interval" @Appender(Index.memory_usage.__doc__) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: # we don't use an explicit engine # so return the bytes here return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) @@ -542,7 +542,7 @@ def mid(self): return self._data.mid @cache_readonly - def is_monotonic(self): + def is_monotonic(self) -> bool: """ Return True if the IntervalIndex is monotonic increasing (only equal or increasing values), else False @@ -550,7 +550,7 @@ def is_monotonic(self): return self.is_monotonic_increasing @cache_readonly - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """ Return True if the IntervalIndex is monotonic increasing (only equal or increasing values), else False @@ -1213,7 +1213,7 @@ def _format_space(self): def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two IntervalIndex objects contain the same elements """ @@ -1374,7 +1374,7 @@ def is_all_dates(self) -> bool: IntervalIndex._add_logical_methods_disabled() -def _is_valid_endpoint(endpoint): +def _is_valid_endpoint(endpoint) -> bool: """helper for interval_range to check if start/end are valid types""" return any( [ @@ -1386,7 +1386,7 @@ def _is_valid_endpoint(endpoint): ) -def 
_is_type_compatible(a, b): +def _is_type_compatible(a, b) -> bool: """helper for interval_range to check type compat of start/end/freq""" is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset)) is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7b02a99263266..f3a735511c96b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1025,7 +1025,7 @@ def _shallow_copy_with_infer(self, values, **kwargs): return self._shallow_copy(values, **kwargs) @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: hash(key) try: self.get_loc(key) @@ -1043,10 +1043,10 @@ def _shallow_copy(self, values=None, **kwargs): return self.copy(**kwargs) @cache_readonly - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype("O") - def _is_memory_usage_qualified(self): + def _is_memory_usage_qualified(self) -> bool: """ return a boolean if we need a qualified .info display """ def f(l): @@ -1055,18 +1055,18 @@ def f(l): return any(f(l) for l in self._inferred_type_levels) @Appender(Index.memory_usage.__doc__) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: # we are overwriting our base class to avoid # computing .values here which could materialize # a tuple representation unnecessarily return self._nbytes(deep) @cache_readonly - def nbytes(self): + def nbytes(self) -> int: """ return the number of bytes in the underlying data """ return self._nbytes(False) - def _nbytes(self, deep=False): + def _nbytes(self, deep: bool = False) -> int: """ return the number of bytes in the underlying data deeply introspect the level data if deep=True @@ -1325,7 +1325,7 @@ def _constructor(self): def inferred_type(self) -> str: return "mixed" - def _get_level_number(self, level): + def _get_level_number(self, level) -> int: count = self.names.count(level) if (count > 1) and not is_integer(level): raise ValueError( @@ -1397,7 +1397,7 @@ def values(self): return self._tuples @cache_readonly - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """ return if the index is monotonic increasing (only equal or increasing) values. @@ -1789,7 +1789,7 @@ def to_flat_index(self): def is_all_dates(self) -> bool: return False - def is_lexsorted(self): + def is_lexsorted(self) -> bool: """ Return True if the codes are lexicographically sorted. 
@@ -3126,7 +3126,7 @@ def truncate(self, before=None, after=None): return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) @@ -3459,7 +3459,7 @@ def isin(self, values, level=None): MultiIndex._add_logical_methods_disabled() -def _sparsify(label_list, start=0, sentinel=""): +def _sparsify(label_list, start: int = 0, sentinel=""): pivoted = list(zip(*label_list)) k = len(label_list) @@ -3487,7 +3487,7 @@ def _sparsify(label_list, start=0, sentinel=""): return list(zip(*result)) -def _get_na_rep(dtype): +def _get_na_rep(dtype) -> str: return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN") diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 3e2b41f62f30b..ee96e4cd699bb 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -206,7 +206,7 @@ class IntegerIndex(NumericIndex): This is an abstract class for Int64Index, UInt64Index. """ - def __contains__(self, key): + def __contains__(self, key) -> bool: """ Check if key is a float and has a decimal. If it has, return False. """ @@ -233,7 +233,7 @@ def inferred_type(self) -> str: return "integer" @property - def asi8(self): + def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self.values.view("i8") @@ -288,7 +288,7 @@ def inferred_type(self) -> str: return "integer" @property - def asi8(self): + def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self.values.view("u8") @@ -425,7 +425,7 @@ def get_value(self, series, key): return new_values - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. """ @@ -447,7 +447,7 @@ def equals(self, other): except (TypeError, ValueError): return False - def __contains__(self, other): + def __contains__(self, other) -> bool: if super().__contains__(other): return True @@ -482,7 +482,7 @@ def get_loc(self, key, method=None, tolerance=None): return super().get_loc(key, method=method, tolerance=tolerance) @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: return super().is_unique and self._nan_idxs.size < 2 @Appender(Index.isin.__doc__) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 2df58b0bbc105..cae1380e930f1 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -310,7 +310,7 @@ def values(self): return np.asarray(self) @property - def freq(self): + def freq(self) -> DateOffset: return self._data.freq @freq.setter @@ -447,7 +447,7 @@ def _engine(self): return self._engine_type(period, len(self)) @Appender(_index_shared_docs["contains"]) - def __contains__(self, key): + def __contains__(self, key) -> bool: if isinstance(key, Period): if key.freq != self.freq: return False @@ -578,7 +578,7 @@ def is_all_dates(self) -> bool: return True @property - def is_full(self): + def is_full(self) -> bool: """ Returns True if this PeriodIndex is range-like in that all Periods between start and end are present, in order. @@ -995,7 +995,9 @@ def memory_usage(self, deep=False): PeriodIndex._add_datetimelike_methods() -def period_range(start=None, end=None, periods=None, freq=None, name=None): +def period_range( + start=None, end=None, periods=None, freq=None, name=None +) -> PeriodIndex: """ Return a fixed frequency PeriodIndex. 
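
The ``-> PeriodIndex`` annotation on ``period_range`` documents existing
runtime behavior rather than changing it; for example (the dates are
illustrative):

    >>> import pandas as pd
    >>> pi = pd.period_range(start="2019-01", periods=3, freq="M")
    >>> isinstance(pi, pd.PeriodIndex)
    True
    >>> len(pi)
    3
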
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 6f806c5bab6e4..d200ff6a71264 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -302,7 +302,7 @@ def _step(self): return self.step @cache_readonly - def nbytes(self): + def nbytes(self) -> int: """ Return the number of bytes in the underlying data. """ @@ -312,7 +312,7 @@ def nbytes(self): for attr_name in ["start", "stop", "step"] ) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: """ Memory usage of my values @@ -338,16 +338,16 @@ def memory_usage(self, deep=False): return self.nbytes @property - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype(np.int64) @property - def is_unique(self): + def is_unique(self) -> bool: """ return if the index has unique values """ return True @cache_readonly - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: return self._range.step > 0 or len(self) <= 1 @cache_readonly @@ -703,7 +703,7 @@ def __len__(self) -> int: return len(self._range) @property - def size(self): + def size(self) -> int: return len(self) def __getitem__(self, key): diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 6caac43af163b..1fd824235c2be 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -604,7 +604,7 @@ def searchsorted(self, value, side="left", sorter=None): return self.values.searchsorted(value, side=side, sorter=sorter) - def is_type_compatible(self, typ): + def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" @property @@ -699,7 +699,7 @@ def delete(self, loc): TimedeltaIndex._add_datetimelike_methods() -def _is_convertible_to_index(other): +def _is_convertible_to_index(other) -> bool: """ return a boolean whether I can attempt conversion to a TimedeltaIndex """ @@ -719,7 +719,7 @@ def _is_convertible_to_index(other): def timedelta_range( start=None, end=None, periods=None, freq=None, name=None, closed=None -): +) -> TimedeltaIndex: """ Return a fixed frequency TimedeltaIndex, with day as the default frequency. 
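
Likewise, ``timedelta_range`` already returned a ``TimedeltaIndex`` and
``is_type_compatible`` a ``bool``; a minimal check (the frequency and period
count are illustrative):

    >>> import pandas as pd
    >>> tdi = pd.timedelta_range(start="1 day", periods=4, freq="6H")
    >>> isinstance(tdi, pd.TimedeltaIndex)
    True
    >>> tdi.is_type_compatible("timedelta")
    True
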
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2980deb9a052c..6d518aa1abeb9 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -489,7 +489,9 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: - columns = get_objs_combined_axis(data, sort=False) + # We know pass_data is non-empty because data[0] is a Series + pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] + columns = get_objs_combined_axis(pass_data, sort=False) indexer_cache = {} diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 3efe8072d3323..3e8d19096a36e 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -522,13 +522,9 @@ def _get_new_axes(self): def _get_comb_axis(self, i): data_axis = self.objs[0]._get_block_manager_axis(i) - try: - return get_objs_combined_axis( - self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort - ) - except IndexError: - types = [type(x).__name__ for x in self.objs] - raise TypeError("Cannot concatenate list of {types}".format(types=types)) + return get_objs_combined_axis( + self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort + ) def _get_concat_axis(self): """ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 9ac27b0450bbe..0626420d9c114 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -541,7 +541,10 @@ def crosstab( rownames = _get_names(index, rownames, prefix="row") colnames = _get_names(columns, colnames, prefix="col") - common_idx = get_objs_combined_axis(index + columns, intersect=True, sort=False) + common_idx = None + pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] + if pass_objs: + common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) data = {} data.update(zip(rownames, index)) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index c237b094a0e01..9ec0dce438099 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -308,7 +308,7 @@ def deltas_asi8(self): return unique_deltas(self.index.asi8) @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: return len(self.deltas) == 1 @cache_readonly From 4ab6ee2fe6ea0eda8031efb040c095a83775d311 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 Nov 2019 12:55:57 -0800 Subject: [PATCH 072/185] TYP: core.computation mostly (#29652) --- pandas/_libs/internals.pyx | 6 +++--- pandas/_libs/sparse.pyx | 12 +++++------ pandas/_libs/writers.pyx | 2 +- pandas/core/computation/align.py | 2 +- pandas/core/computation/engines.py | 9 ++++---- pandas/core/computation/ops.py | 33 +++++++++++++++-------------- pandas/core/computation/pytables.py | 26 +++++++++++++---------- pandas/core/computation/scope.py | 25 +++++++++++----------- 8 files changed, 61 insertions(+), 54 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 08decb44a8a53..8e61a772912af 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -85,7 +85,7 @@ cdef class BlockPlacement: return iter(self._as_array) @property - def as_slice(self): + def as_slice(self) -> slice: cdef: slice s = self._ensure_has_slice() if s is None: @@ -118,7 +118,7 @@ cdef class BlockPlacement: return self._as_array @property - def is_slice_like(self): + def 
is_slice_like(self) -> bool: cdef: slice s = self._ensure_has_slice() return s is not None @@ -441,7 +441,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): yield blkno, result -def get_blkno_placements(blknos, group=True): +def get_blkno_placements(blknos, group: bool = True): """ Parameters diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 578995a3eb3b6..ee83901040b36 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -57,7 +57,7 @@ cdef class IntIndex(SparseIndex): return output @property - def nbytes(self): + def nbytes(self) -> int: return self.indices.nbytes def check_integrity(self): @@ -91,7 +91,7 @@ cdef class IntIndex(SparseIndex): if not monotonic: raise ValueError("Indices must be strictly increasing") - def equals(self, other): + def equals(self, other) -> bool: if not isinstance(other, IntIndex): return False @@ -103,7 +103,7 @@ cdef class IntIndex(SparseIndex): return same_length and same_indices @property - def ngaps(self): + def ngaps(self) -> int: return self.length - self.npoints def to_int_index(self): @@ -348,11 +348,11 @@ cdef class BlockIndex(SparseIndex): return output @property - def nbytes(self): + def nbytes(self) -> int: return self.blocs.nbytes + self.blengths.nbytes @property - def ngaps(self): + def ngaps(self) -> int: return self.length - self.npoints cpdef check_integrity(self): @@ -388,7 +388,7 @@ cdef class BlockIndex(SparseIndex): if blengths[i] == 0: raise ValueError(f'Zero-length block {i}') - def equals(self, other): + def equals(self, other) -> bool: if not isinstance(other, BlockIndex): return False diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 1775893b9f2bf..73201e75c3c88 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -70,7 +70,7 @@ def write_csv_rows(list data, ndarray data_index, @cython.boundscheck(False) @cython.wraparound(False) -def convert_json_to_lines(object arr): +def convert_json_to_lines(arr: object) -> str: """ replace comma separated json with line feeds, paying special attention to quotes & brackets diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 3e1e5ed89d877..dfb858d797f41 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -33,7 +33,7 @@ def _zip_axes_from_type(typ, new_axes): return axes -def _any_pandas_objects(terms): +def _any_pandas_objects(terms) -> bool: """Check a sequence of terms for instances of PandasObject.""" return any(isinstance(term.value, PandasObject) for term in terms) diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index dc6378e83d229..513eb0fd7f2a6 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -46,8 +46,9 @@ def __init__(self, expr): self.aligned_axes = None self.result_type = None - def convert(self): - """Convert an expression for evaluation. + def convert(self) -> str: + """ + Convert an expression for evaluation. Defaults to return the expression as a string. 
""" @@ -75,7 +76,7 @@ def evaluate(self): ) @property - def _is_aligned(self): + def _is_aligned(self) -> bool: return self.aligned_axes is not None and self.result_type is not None @abc.abstractmethod @@ -104,7 +105,7 @@ class NumExprEngine(AbstractEngine): def __init__(self, expr): super().__init__(expr) - def convert(self): + def convert(self) -> str: return str(super().convert()) def _evaluate(self): diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 8fab5bd87d4fe..0fdbdda30ad35 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -70,6 +70,7 @@ def __new__(cls, name, env, side=None, encoding=None): return supr_new(klass) def __init__(self, name, env, side=None, encoding=None): + # name is a str for Term, but may be something else for subclasses self._name = name self.env = env self.side = side @@ -79,7 +80,7 @@ def __init__(self, name, env, side=None, encoding=None): self.encoding = encoding @property - def local_name(self): + def local_name(self) -> str: return self.name.replace(_LOCAL_TAG, "") def __repr__(self) -> str: @@ -120,7 +121,7 @@ def update(self, value): self.value = value @property - def is_scalar(self): + def is_scalar(self) -> bool: return is_scalar(self._value) @property @@ -139,14 +140,14 @@ def type(self): return_type = type @property - def raw(self): + def raw(self) -> str: return pprint_thing( "{0}(name={1!r}, type={2})" "".format(self.__class__.__name__, self.name, self.type) ) @property - def is_datetime(self): + def is_datetime(self) -> bool: try: t = self.type.type except AttributeError: @@ -220,7 +221,7 @@ def return_type(self): return _result_type_many(*(term.type for term in com.flatten(self))) @property - def has_invalid_return_type(self): + def has_invalid_return_type(self) -> bool: types = self.operand_types obj_dtype_set = frozenset([np.dtype("object")]) return self.return_type == object and types - obj_dtype_set @@ -230,11 +231,11 @@ def operand_types(self): return frozenset(term.type for term in com.flatten(self)) @property - def is_scalar(self): + def is_scalar(self) -> bool: return all(operand.is_scalar for operand in self.operands) @property - def is_datetime(self): + def is_datetime(self) -> bool: try: t = self.return_type.type except AttributeError: @@ -339,7 +340,7 @@ def _cast_inplace(terms, acceptable_dtypes, dtype): term.update(new_value) -def is_term(obj): +def is_term(obj) -> bool: return isinstance(obj, Term) @@ -354,7 +355,7 @@ class BinOp(Op): right : Term or Op """ - def __init__(self, op, lhs, rhs, **kwargs): + def __init__(self, op: str, lhs, rhs, **kwargs): super().__init__(op, (lhs, rhs)) self.lhs = lhs self.rhs = rhs @@ -396,7 +397,7 @@ def __call__(self, env): return self.func(left, right) - def evaluate(self, env, engine, parser, term_type, eval_in_python): + def evaluate(self, env, engine: str, parser, term_type, eval_in_python): """ Evaluate a binary operation *before* being passed to the engine. @@ -488,7 +489,7 @@ def _disallow_scalar_only_bool_ops(self): raise NotImplementedError("cannot evaluate scalar only bool ops") -def isnumeric(dtype): +def isnumeric(dtype) -> bool: return issubclass(np.dtype(dtype).type, np.number) @@ -505,8 +506,8 @@ class Div(BinOp): regardless of the value of ``truediv``. 
""" - def __init__(self, lhs, rhs, truediv, *args, **kwargs): - super().__init__("/", lhs, rhs, *args, **kwargs) + def __init__(self, lhs, rhs, truediv: bool, **kwargs): + super().__init__("/", lhs, rhs, **kwargs) if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): raise TypeError( @@ -541,7 +542,7 @@ class UnaryOp(Op): * If no function associated with the passed operator token is found. """ - def __init__(self, op, operand): + def __init__(self, op: str, operand): super().__init__(op, (operand,)) self.operand = operand @@ -561,7 +562,7 @@ def __repr__(self) -> str: return pprint_thing("{0}({1})".format(self.op, self.operand)) @property - def return_type(self): + def return_type(self) -> np.dtype: operand = self.operand if operand.return_type == np.dtype("bool"): return np.dtype("bool") @@ -588,7 +589,7 @@ def __repr__(self) -> str: class FuncNode: - def __init__(self, name): + def __init__(self, name: str): from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION if name not in _mathops or ( diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 3a2ea30cbc8b9..13a4814068d6a 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -2,6 +2,7 @@ import ast from functools import partial +from typing import Optional import numpy as np @@ -129,12 +130,12 @@ def conform(self, rhs): return rhs @property - def is_valid(self): + def is_valid(self) -> bool: """ return True if this is a valid field """ return self.lhs in self.queryables @property - def is_in_table(self): + def is_in_table(self) -> bool: """ return True if this is a valid column name for generation (e.g. an actual column in the table) """ return self.queryables.get(self.lhs) is not None @@ -154,12 +155,12 @@ def metadata(self): """ the metadata of my field """ return getattr(self.queryables.get(self.lhs), "metadata", None) - def generate(self, v): + def generate(self, v) -> str: """ create and return the op string for this TermValue """ val = v.tostring(self.encoding) return "({lhs} {op} {val})".format(lhs=self.lhs, op=self.op, val=val) - def convert_value(self, v): + def convert_value(self, v) -> "TermValue": """ convert the expression that is in the term to something that is accepted by pytables """ @@ -279,7 +280,7 @@ def evaluate(self): return self - def generate_filter_op(self, invert=False): + def generate_filter_op(self, invert: bool = False): if (self.op == "!=" and not invert) or (self.op == "==" and invert): return lambda axis, vals: ~axis.isin(vals) else: @@ -505,7 +506,7 @@ class Expr(expr.Expr): "major_axis>=20130101" """ - def __init__(self, where, queryables=None, encoding=None, scope_level=0): + def __init__(self, where, queryables=None, encoding=None, scope_level: int = 0): where = _validate_where(where) @@ -520,18 +521,21 @@ def __init__(self, where, queryables=None, encoding=None, scope_level=0): if isinstance(where, Expr): local_dict = where.env.scope - where = where.expr + _where = where.expr elif isinstance(where, (list, tuple)): + where = list(where) for idx, w in enumerate(where): if isinstance(w, Expr): local_dict = w.env.scope else: w = _validate_where(w) where[idx] = w - where = " & ".join(map("({})".format, com.flatten(where))) # noqa + _where = " & ".join(map("({})".format, com.flatten(where))) + else: + _where = where - self.expr = where + self.expr = _where self.env = Scope(scope_level + 1, local_dict=local_dict) if queryables is not None and isinstance(self.expr, str): @@ -574,7 +578,7 @@ 
def evaluate(self): class TermValue: """ hold a term value the we use to construct a condition/filter """ - def __init__(self, value, converted, kind): + def __init__(self, value, converted, kind: Optional[str]): self.value = value self.converted = converted self.kind = kind @@ -593,7 +597,7 @@ def tostring(self, encoding): return self.converted -def maybe_expression(s): +def maybe_expression(s) -> bool: """ loose checking if s is a pytables-acceptable expression """ if not isinstance(s, str): return False diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 81c7b04bf3284..ee82664f6cb21 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -29,7 +29,7 @@ def _ensure_scope( ) -def _replacer(x): +def _replacer(x) -> str: """Replace a number with its hexadecimal representation. Used to tag temporary variables with their calling scope's id. """ @@ -44,11 +44,11 @@ def _replacer(x): return hex(hexin) -def _raw_hex_id(obj): +def _raw_hex_id(obj) -> str: """Return the padded hexadecimal id of ``obj``.""" # interpret as a pointer since that's what really what id returns packed = struct.pack("@P", id(obj)) - return "".join(map(_replacer, packed)) + return "".join(_replacer(x) for x in packed) _DEFAULT_GLOBALS = { @@ -63,7 +63,7 @@ def _raw_hex_id(obj): } -def _get_pretty_string(obj): +def _get_pretty_string(obj) -> str: """ Return a prettier version of obj. @@ -74,7 +74,7 @@ def _get_pretty_string(obj): Returns ------- - s : str + str Pretty print object repr """ sio = StringIO() @@ -148,8 +148,9 @@ def __repr__(self) -> str: ) @property - def has_resolvers(self): - """Return whether we have any extra scope. + def has_resolvers(self) -> bool: + """ + Return whether we have any extra scope. For example, DataFrames pass Their columns as resolvers during calls to ``DataFrame.eval()`` and ``DataFrame.query()``. @@ -250,13 +251,13 @@ def _get_vars(self, stack, scopes): # scope after the loop del frame - def update(self, level): + def update(self, level: int): """ Update the current scope by going back `level` levels. Parameters ---------- - level : int or None, optional, default None + level : int """ sl = level + 1 @@ -270,7 +271,7 @@ def update(self, level): finally: del stack[:], stack - def add_tmp(self, value): + def add_tmp(self, value) -> str: """ Add a temporary variable to the scope. @@ -281,7 +282,7 @@ def add_tmp(self, value): Returns ------- - name : basestring + str The name of the temporary variable created. 
""" name = "{name}_{num}_{hex_id}".format( @@ -297,7 +298,7 @@ def add_tmp(self, value): return name @property - def ntemps(self): + def ntemps(self) -> int: """The number of temporary variables in this scope""" return len(self.temps) From ded50fd01d9f737f81d486e5b3d0f2cac959c325 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 Nov 2019 12:58:20 -0800 Subject: [PATCH 073/185] REF: remove unnecessary _try_cast calls (#29642) --- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9073a1e31dfb0..f6c4836632795 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -397,7 +397,7 @@ def _aggregate_named(self, func, *args, **kwargs): output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): raise ValueError("Must produce aggregated value") - result[name] = self._try_cast(output, group) + result[name] = output return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 280f1e88b0ea8..746fb21c5776e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1357,14 +1357,6 @@ def f(self, **kwargs): # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - - # coerce the resulting columns if we can - if isinstance(result, DataFrame): - for col in result.columns: - result[col] = self._try_cast(result[col], self.obj[col]) - else: - result = self._try_cast(result, self.obj) - return result set_function_name(f, name, cls) From 29be383d1ff9355cd7b3031f595201af863406e5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 Nov 2019 13:06:45 -0800 Subject: [PATCH 074/185] REF: simplify _iterate_slices (#29629) --- pandas/core/groupby/generic.py | 24 ++++++------------------ pandas/core/groupby/groupby.py | 19 ++++++++++++------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f6c4836632795..6376dbefcf435 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -10,19 +10,7 @@ from functools import partial from textwrap import dedent import typing -from typing import ( - Any, - Callable, - FrozenSet, - Hashable, - Iterable, - Optional, - Sequence, - Tuple, - Type, - Union, - cast, -) +from typing import Any, Callable, FrozenSet, Iterable, Sequence, Type, Union, cast import warnings import numpy as np @@ -142,8 +130,8 @@ def pinner(cls): class SeriesGroupBy(GroupBy): _apply_whitelist = base.series_apply_whitelist - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: - yield self._selection_name, self._selected_obj + def _iterate_slices(self) -> Iterable[Series]: + yield self._selected_obj @property def _selection_name(self): @@ -923,20 +911,20 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: obj = obj.T if isinstance(obj, Series) and obj.name not in self.exclusions: # Occurs when doing DataFrameGroupBy(...)["X"] - yield obj.name, obj + yield obj else: for label, values in obj.items(): if label in self.exclusions: continue - yield label, values + yield values def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 diff 
--git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 746fb21c5776e..294cb723eee1a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -14,7 +14,7 @@ class providing the base-class of operations. import inspect import re import types -from typing import FrozenSet, Hashable, Iterable, List, Optional, Tuple, Type, Union +from typing import FrozenSet, Iterable, List, Optional, Tuple, Type, Union import numpy as np @@ -439,7 +439,7 @@ def _get_indices(self, names): def get_converter(s): # possibly convert to the actual key types # in the indices, could be a Timestamp or a np.datetime64 - if isinstance(s, (Timestamp, datetime.datetime)): + if isinstance(s, datetime.datetime): return lambda key: Timestamp(key) elif isinstance(s, np.datetime64): return lambda key: Timestamp(key).asm8 @@ -488,6 +488,7 @@ def _get_index(self, name): @cache_readonly def _selected_obj(self): + # Note: _selected_obj is always just `self.obj` for SeriesGroupBy if self._selection is None or isinstance(self.obj, Series): if self._group_selection is not None: @@ -736,7 +737,7 @@ def _python_apply_general(self, f): keys, values, not_indexed_same=mutated or self.mutated ) - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): @@ -832,7 +833,8 @@ def _transform_should_cast(self, func_nm: str) -> bool: def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): output = collections.OrderedDict() # type: dict - for name, obj in self._iterate_slices(): + for obj in self._iterate_slices(): + name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue @@ -864,7 +866,8 @@ def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): output = {} - for name, obj in self._iterate_slices(): + for obj in self._iterate_slices(): + name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue @@ -883,7 +886,8 @@ def _python_agg_general(self, func, *args, **kwargs): # iterate through "columns" ex exclusions to populate output dict output = {} - for name, obj in self._iterate_slices(): + for obj in self._iterate_slices(): + name = obj.name if self.grouper.ngroups == 0: # agg_series below assumes ngroups > 0 continue @@ -2234,7 +2238,8 @@ def _get_cythonized_result( output = collections.OrderedDict() # type: dict base_func = getattr(libgroupby, how) - for name, obj in self._iterate_slices(): + for obj in self._iterate_slices(): + name = obj.name values = obj._data._values if aggregate: From e639af2afd18b90ab9063df9c1927ae1f357a418 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 16 Nov 2019 16:35:55 -0500 Subject: [PATCH 075/185] BUG: Appending empty list to DataFrame #28769 (#28834) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/frame.py | 11 +++++++---- pandas/tests/frame/test_combine_concat.py | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 896ae91c68642..f1ed6f0844a29 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -465,6 +465,7 @@ Other - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) - :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes 
empty groups (:issue: 28479) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) +- Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c9189e5f8316..442994a04caee 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6943,10 +6943,13 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=None): other = other._convert(datetime=True, timedelta=True) if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) - elif isinstance(other, list) and not isinstance(other[0], DataFrame): - other = DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.reindex(columns=self.columns) + elif isinstance(other, list): + if not other: + pass + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) + if (self.columns.get_indexer(other.columns) >= 0).all(): + other = other.reindex(columns=self.columns) from pandas.core.reshape.concat import concat diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index e3f37e1ef3186..12d06dc517f19 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -128,6 +128,20 @@ def test_concat_tuple_keys(self): ) tm.assert_frame_equal(results, expected) + def test_append_empty_list(self): + # GH 28769 + df = DataFrame() + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df + + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df # .append() should return a new object + def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) From fb08ceeeeba2ba62f92b47d424b3ae83c20ed9db Mon Sep 17 00:00:00 2001 From: Justin Zheng Date: Sat, 16 Nov 2019 13:54:01 -0800 Subject: [PATCH 076/185] BUG-26988 implement replace for categorical blocks (#27026) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/arrays/categorical.py | 45 +++++++++++++++++++ pandas/core/internals/blocks.py | 24 ++++++++++ pandas/tests/arrays/categorical/test_algos.py | 18 ++++++++ pandas/tests/frame/test_replace.py | 18 ++++++++ pandas/tests/series/test_replace.py | 23 ++++++++++ 6 files changed, 129 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f1ed6f0844a29..a5b459f3372e6 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -298,6 +298,7 @@ Categorical - Using date accessors on a categorical dtyped :class:`Series` of datetimes was not returning an object of the same type as if one used the :meth:`.str.` / :meth:`.dt.` on a :class:`Series` of that type. E.g. 
when accessing :meth:`Series.dt.tz_localize` on a :class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue: `27952`) +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) Datetimelike diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 53689b6bc2eba..c6e2a7b7a6e00 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2470,6 +2470,51 @@ def isin(self, values): code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) + def replace(self, to_replace, value, inplace: bool = False): + """ + Replaces all instances of one value with another + + Parameters + ---------- + to_replace: object + The value to be replaced + + value: object + The value to replace it with + + inplace: bool + Whether the operation is done in-place + + Returns + ------- + None if inplace is True, otherwise the new Categorical after replacement + + + Examples + -------- + >>> s = pd.Categorical([1, 2, 1, 3]) + >>> s.replace(1, 3) + [3, 3, 2, 3] + Categories (2, int64): [2, 3] + """ + inplace = validate_bool_kwarg(inplace, "inplace") + cat = self if inplace else self.copy() + if to_replace in cat.categories: + if isna(value): + cat.remove_categories(to_replace, inplace=True) + else: + categories = cat.categories.tolist() + index = categories.index(to_replace) + if value in cat.categories: + value_index = categories.index(value) + cat._codes[cat._codes == index] = value_index + cat.remove_categories(to_replace, inplace=True) + else: + categories[index] = value + cat.rename_categories(categories, inplace=True) + if not inplace: + return cat + # The Series.cat accessor diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 38e1f241c1d77..7ace80415c846 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2924,6 +2924,30 @@ def where( ) return result + def replace( + self, + to_replace, + value, + inplace: bool = False, + filter=None, + regex: bool = False, + convert: bool = True, + ): + inplace = validate_bool_kwarg(inplace, "inplace") + result = self if inplace else self.copy() + if filter is None: # replace was called on a series + result.values.replace(to_replace, value, inplace=True) + if convert: + return result.convert(numeric=False, copy=not inplace) + else: + return result + else: # replace was called on a DataFrame + if not isna(value): + result.values.add_categories(value, inplace=True) + return super(CategoricalBlock, result).replace( + to_replace, value, inplace, filter, regex, convert + ) + # ----------------------------------------------------------------- # Constructor Helpers diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 1a48ccf85f947..e076015c5f61d 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -59,6 +59,24 @@ def test_isin_cats(): tm.assert_numpy_array_equal(expected, result) +@pytest.mark.parametrize( + "to_replace, value, result", + [("b", "c", ["a", "c"]), ("c", "d", ["a", "b"]), ("b", None, ["a", None])], +) +def test_replace(to_replace, value, result): + # GH 26988 + cat = pd.Categorical(["a", "b"]) + expected = pd.Categorical(result) + result = cat.replace(to_replace, value) + tm.assert_categorical_equal(result, expected) + if to_replace == "b": # the "c" test is supposed to be 
unchanged + with pytest.raises(AssertionError): + # ensure non-inplace call does not affect original + tm.assert_categorical_equal(cat, expected) + cat.replace(to_replace, value, inplace=True) + tm.assert_categorical_equal(cat, expected) + + @pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])]) def test_isin_empty(empty): s = pd.Categorical(["a", "b"]) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index c30efa121262f..60b601b57e007 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1296,6 +1296,24 @@ def test_replace_method(self, to_replace, method, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + expected = DataFrame(final_data, columns=["a", "b"], dtype="category") + expected["a"] = expected["a"].cat.set_categories([1, 2, 3]) + expected["b"] = expected["b"].cat.set_categories([1, 2, 3]) + result = df.replace(replace_dict, 3) + tm.assert_frame_equal(result, expected) + with pytest.raises(AssertionError): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + df.replace(replace_dict, 3, inplace=True) + tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize( "df, to_replace, exp", [ diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index ebfd468e034f9..8018ecf03960c 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -293,6 +293,29 @@ def test_replace_categorical(self, categorical, numeric): expected = pd.Series(numeric) tm.assert_series_equal(expected, result, check_dtype=False) + def test_replace_categorical_single(self): + # GH 26988 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + s = pd.Series(dti) + c = s.astype("category") + + expected = c.copy() + expected = expected.cat.add_categories("foo") + expected[2] = "foo" + expected = expected.cat.remove_unused_categories() + assert c[2] != "foo" + + result = c.replace(c[2], "foo") + tm.assert_series_equal(expected, result) + assert c[2] != "foo" # ensure non-inplace call does not alter original + + c.replace(c[2], "foo", inplace=True) + tm.assert_series_equal(expected, c) + + first_value = c[0] + c.replace(c[1], c[0], inplace=True) + assert c[0] == c[1] == first_value # test replacing with existing value + def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError From 7a13fc417d79d0a07e043fee4afce4d41f8db305 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sat, 16 Nov 2019 22:00:32 +0000 Subject: [PATCH 077/185] TST: Check error raised when inserting wrong length categorical column (#29523) --- pandas/tests/frame/test_indexing.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 61bf91d3018eb..e37f734c6235e 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -3881,3 +3881,11 @@ def test_loc_indexing_preserves_index_category_dtype(self): result = df.loc[["a"]].index.levels[0] tm.assert_index_equal(result, expected) + + def test_wrong_length_cat_dtype_raises(self): + # GH29523 + cat = 
pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) + df = pd.DataFrame({"bar": range(10)}) + err = "Length of values does not match length of index" + with pytest.raises(ValueError, match=err): + df["foo"] = cat From a9abadcfcc4121dca45df0168afd833f41b8ed74 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sat, 16 Nov 2019 22:04:44 +0000 Subject: [PATCH 078/185] TST: Merge multiple cols with mixed columns/index (#29522) --- pandas/tests/reshape/merge/test_multi.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 1d8d2add3840c..bce62571d55ec 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -195,6 +195,27 @@ def test_merge_right_vs_left(self, left, right, sort): tm.assert_frame_equal(merged_left_right, merge_right_left) + def test_merge_multiple_cols_with_mixed_cols_index(self): + # GH29522 + s = pd.Series( + range(6), + pd.MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]), + name="Amount", + ) + df = pd.DataFrame( + {"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0} + ) + result = pd.merge(df, s.reset_index(), on=["lev1", "lev2"]) + expected = pd.DataFrame( + { + "lev1": list("AAABBB"), + "lev2": [1, 2, 3, 1, 2, 3], + "col": [0] * 6, + "Amount": range(6), + } + ) + tm.assert_frame_equal(result, expected) + def test_compress_group_combinations(self): # ~ 40000000 possible unique groups From 94412eebd218b346ec4db0453f4cc8c31f10233d Mon Sep 17 00:00:00 2001 From: ganevgv Date: Sun, 17 Nov 2019 00:06:09 +0200 Subject: [PATCH 079/185] TST: new test for subset of a MultiIndex dtype (#29356) --- pandas/tests/test_multilevel.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 4d0f0b57c65af..f0928820367e9 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1989,6 +1989,15 @@ def test_repeat(self): m_df = Series(data, index=m_idx) assert m_df.repeat(3).shape == (3 * len(data),) + def test_subsets_multiindex_dtype(self): + # GH 20757 + data = [["x", 1]] + columns = [("a", "b", np.nan), ("a", "c", 0.0)] + df = DataFrame(data, columns=pd.MultiIndex.from_tuples(columns)) + expected = df.dtypes.a.b + result = df.a.b.dtypes + tm.assert_series_equal(result, expected) + class TestSorted(Base): """ everything you wanted to test about sorting """ From ef10c895e8cea708755e10c1a6dfdeba1b9c4b23 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Nov 2019 19:34:13 -0600 Subject: [PATCH 080/185] DOC: Update MultiIndex.names whatsnew (#29572) --- doc/source/user_guide/advanced.rst | 25 +++++++++++++++++ doc/source/whatsnew/v1.0.0.rst | 45 +++++++++++++++++------------- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index c6eadd2adadce..31bb71064d735 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -554,6 +554,31 @@ index. Both ``rename`` and ``rename_axis`` support specifying a dictionary, ``Series`` or a mapping function to map labels/names to new values. +When working with an ``Index`` object directly, rather than via a ``DataFrame``, +:meth:`Index.set_names` can be used to change the names. + +.. 
ipython:: python

+    mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y'])
+    mi.names
+
+    mi2 = mi.rename("new name", level=0)
+    mi2
+
+.. warning::
+
+   Prior to pandas 1.0.0, you could also set the names of a ``MultiIndex``
+   by updating the name of a level.
+
+   .. code-block:: none
+
+      >>> mi.levels[0].name = 'name via level'
+      >>> mi.names[0]  # only works for older pandas
+      'name via level'
+
+   As of pandas 1.0, this will *silently* fail to update the names
+   of the MultiIndex. Use :meth:`Index.set_names` instead.
+
 Sorting a ``MultiIndex``
 ------------------------
 
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index a5b459f3372e6..c91ced1014dd1 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -130,34 +130,39 @@ Backwards incompatible API changes
 
 .. _whatsnew_1000.api_breaking.MultiIndex._names:
 
-``MultiIndex.levels`` do not hold level names any longer
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Avoid using names from ``MultiIndex.levels``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- A :class:`MultiIndex` previously stored the level names as attributes of each of its
-  :attr:`MultiIndex.levels`. From Pandas 1.0, the names are only accessed through
-  :attr:`MultiIndex.names` (which was also possible previously). This is done in order to
-  make :attr:`MultiIndex.levels` more similar to :attr:`CategoricalIndex.categories` (:issue:`27242`:).
+As part of a larger refactor to :class:`MultiIndex` the level names are now
+stored separately from the levels (:issue:`27242`). We recommend using
+:attr:`MultiIndex.names` to access the names, and :meth:`Index.set_names`
+to update the names.
 
-*pandas 0.25.x*
+For backwards compatibility, you can still *access* the names via the levels.
 
-.. code-block:: ipython
+.. ipython:: python
 
-   In [1]: mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y'])
-   Out[2]: mi
-   MultiIndex([(1, 'a'),
-               (1, 'b'),
-               (2, 'a'),
-               (2, 'b')],
-              names=['x', 'y'])
-   Out[3]: mi.levels[0].name
-   'x'
+   mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y'])
+   mi.levels[0].name
 
-*pandas 1.0.0*
+However, it is no longer possible to *update* the names of the ``MultiIndex``
+via the name of the level. The following will **silently** fail to update the
+name of the ``MultiIndex``
 
 .. ipython:: python
 
-   mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y'])
-   mi.levels[0].name
+   mi.levels[0].name = "new name"
+   mi.names
+
+To update, use ``MultiIndex.set_names``, which returns a new ``MultiIndex``.
+
+.. ipython:: python
+
+   mi2 = mi.set_names("new name", level=0)
+   mi2.names
+
+New repr for :class:`pandas.core.arrays.IntervalArray`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 - :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`)
 
From ab5731f7a0e835dc97d47eee8703c9f196c25097 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Sat, 16 Nov 2019 19:52:29 -0800
Subject: [PATCH 081/185] MAINT: Grammar fix in activate.yml

---
 .github/workflows/activate.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/activate.yml b/.github/workflows/activate.yml
index f6aede6289ebf..83838ea3d3941 100644
--- a/.github/workflows/activate.yml
+++ b/.github/workflows/activate.yml
@@ -1,5 +1,5 @@
 # Simple first task to activate GitHub actions.
-# This won't run until is merged, but future actions will +# This won't run until it is merged, but future actions will # run on PRs, so we can see we don't break things in more # complex actions added later, like real builds. # From 797732ae1165ea982e3d226b9441332af9ffc699 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 17 Nov 2019 10:23:43 +0000 Subject: [PATCH 082/185] CLN: Make non-empty **kwargs in Index.__new__ fail instead of silently dropped (#29625) --- pandas/core/indexes/base.py | 15 ++++++++--- .../tests/indexes/multi/test_constructor.py | 7 +++-- .../tests/indexes/multi/test_equivalence.py | 5 +++- pandas/tests/indexes/test_base.py | 5 ++++ pandas/tests/indexing/test_coercion.py | 26 +++++++++---------- pandas/tests/io/excel/test_readers.py | 2 +- 6 files changed, 37 insertions(+), 23 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 699994964ab40..a8c7100b3846a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -54,6 +54,7 @@ ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, + ABCIntervalIndex, ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, @@ -450,7 +451,9 @@ def __new__( return PeriodIndex(subarr, name=name, **kwargs) except IncompatibleFrequency: pass - return cls._simple_new(subarr, name) + if kwargs: + raise TypeError(f"Unexpected keyword arguments {set(kwargs)!r}") + return cls._simple_new(subarr, name, **kwargs) elif hasattr(data, "__array__"): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) @@ -3391,7 +3394,7 @@ def _reindex_non_unique(self, target): new_indexer = np.arange(len(self.take(indexer))) new_indexer[~check] = -1 - new_index = self._shallow_copy_with_infer(new_labels, freq=None) + new_index = self._shallow_copy_with_infer(new_labels) return new_index, indexer, new_indexer # -------------------------------------------------------------------- @@ -4254,7 +4257,13 @@ def _concat_same_dtype(self, to_concat, name): Concatenate to_concat which has the same class. 
""" # must be overridden in specific classes - klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, ExtensionArray) + klasses = ( + ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex, + ExtensionArray, + ABCIntervalIndex, + ) to_concat = [ x.astype(object) if isinstance(x, klasses) else x for x in to_concat ] diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index c32adf275ac98..d2c95b12d5339 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -609,12 +609,11 @@ def test_create_index_existing_name(idx): ("qux", "two"), ], dtype="object", - ), - names=["foo", "bar"], + ) ) tm.assert_index_equal(result, expected) - result = pd.Index(index, names=["A", "B"]) + result = pd.Index(index, name="A") expected = Index( Index( [ @@ -627,7 +626,7 @@ def test_create_index_existing_name(idx): ], dtype="object", ), - names=["A", "B"], + name="A", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index f61ba0132ab97..c81af5a0c6c49 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -146,7 +146,10 @@ def test_identical(idx): assert mi.identical(mi2) mi3 = Index(mi.tolist(), names=mi.names) - mi4 = Index(mi.tolist(), names=mi.names, tupleize_cols=False) + msg = r"Unexpected keyword arguments {'names'}" + with pytest.raises(TypeError, match=msg): + Index(mi.tolist(), names=mi.names, tupleize_cols=False) + mi4 = Index(mi.tolist(), tupleize_cols=False) assert mi.identical(mi3) assert not mi.identical(mi4) assert mi.equals(mi4) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8ffceb491aa86..5bfa13c0865f1 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -350,6 +350,11 @@ def test_constructor_simple_new(self, vals, dtype): result = index._simple_new(index.values, dtype) tm.assert_index_equal(result, index) + def test_constructor_wrong_kwargs(self): + # GH #19348 + with pytest.raises(TypeError, match="Unexpected keyword arguments {'foo'}"): + Index([], foo="bar") + @pytest.mark.parametrize( "vals", [ diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 4f38d7beb9c0b..469c011001467 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -479,22 +479,20 @@ def test_insert_index_period(self, insert, coerced_val, coerced_dtype): obj = pd.PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq="M") assert obj.dtype == "period[M]" + data = [ + pd.Period("2011-01", freq="M"), + coerced_val, + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] if isinstance(insert, pd.Period): - index_type = pd.PeriodIndex + exp = pd.PeriodIndex(data, freq="M") + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) else: - index_type = pd.Index - - exp = index_type( - [ - pd.Period("2011-01", freq="M"), - coerced_val, - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - pd.Period("2011-04", freq="M"), - ], - freq="M", - ) - self._assert_insert_conversion(obj, insert, exp, coerced_dtype) + msg = r"Unexpected keyword arguments {'freq'}" + with pytest.raises(TypeError, match=msg): + pd.Index(data, freq="M") def test_insert_index_complex128(self): pass diff --git 
a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 70a86c2233513..d1611eebe2059 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -893,7 +893,7 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): def test_unexpected_kwargs_raises(self, read_ext, arg): # gh-17964 kwarg = {arg: "Sheet1"} - msg = "unexpected keyword argument `{}`".format(arg) + msg = r"unexpected keyword argument `{}`".format(arg) with pd.ExcelFile("test1" + read_ext) as excel: with pytest.raises(TypeError, match=msg): From d3bfcdc0594a4900176cd08238ab97c8145c9fe3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 05:34:25 -0800 Subject: [PATCH 083/185] CLN: de-privatize names in core.computation (#29665) --- pandas/core/computation/align.py | 24 ++++++++++++++---------- pandas/core/computation/common.py | 2 +- pandas/core/computation/engines.py | 6 +++--- pandas/core/computation/eval.py | 14 ++++++-------- pandas/core/computation/expr.py | 7 ++++--- pandas/core/computation/ops.py | 4 ++-- 6 files changed, 30 insertions(+), 27 deletions(-) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index dfb858d797f41..197ddd999fd37 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -8,10 +8,11 @@ from pandas.errors import PerformanceWarning -import pandas as pd +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.computation.common import _result_type_many +from pandas.core.computation.common import result_type_many def _align_core_single_unary_op(term): @@ -49,7 +50,7 @@ def wrapper(terms): # we don't have any pandas objects if not _any_pandas_objects(terms): - return _result_type_many(*term_values), None + return result_type_many(*term_values), None return f(terms) @@ -60,7 +61,10 @@ def wrapper(terms): def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] term_dims = [terms[i].value.ndim for i in term_index] - ndims = pd.Series(dict(zip(term_index, term_dims))) + + from pandas import Series + + ndims = Series(dict(zip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term biggest = terms[ndims.idxmax()].value @@ -70,7 +74,7 @@ def _align_core(terms): gt_than_one_axis = naxes > 1 for value in (terms[i].value for i in term_index): - is_series = isinstance(value, pd.Series) + is_series = isinstance(value, ABCSeries) is_series_and_gt_one_axis = is_series and gt_than_one_axis for axis, items in enumerate(value.axes): @@ -87,7 +91,7 @@ def _align_core(terms): ti = terms[i].value if hasattr(ti, "reindex"): - transpose = isinstance(ti, pd.Series) and naxes > 1 + transpose = isinstance(ti, ABCSeries) and naxes > 1 reindexer = axes[naxes - 1] if transpose else items term_axis_size = len(ti.axes[axis]) @@ -111,28 +115,28 @@ def _align_core(terms): return typ, _zip_axes_from_type(typ, axes) -def _align(terms): +def align_terms(terms): """Align a set of terms""" try: # flatten the parse tree (a nested list, really) terms = list(com.flatten(terms)) except TypeError: # can't iterate so it must just be a constant or single variable - if isinstance(terms.value, pd.core.generic.NDFrame): + if isinstance(terms.value, (ABCSeries, ABCDataFrame)): typ = type(terms.value) return typ, _zip_axes_from_type(typ, terms.value.axes) return np.result_type(terms.type), None # if all resolved variables are 
numeric scalars if all(term.is_scalar for term in terms): - return _result_type_many(*(term.value for term in terms)).type, None + return result_type_many(*(term.value for term in terms)).type, None # perform the main alignment typ, axes = _align_core(terms) return typ, axes -def _reconstruct_object(typ, obj, axes, dtype): +def reconstruct_object(typ, obj, axes, dtype): """ Reconstruct an object given its type, raw value, and possibly empty (None) axes. diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index bd32c8bee1cdf..da47449d5e62e 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -15,7 +15,7 @@ def _ensure_decoded(s): return s -def _result_type_many(*arrays_and_dtypes): +def result_type_many(*arrays_and_dtypes): """ wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) argument limit """ try: diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 513eb0fd7f2a6..2f3c519d352c6 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -4,7 +4,7 @@ import abc -from pandas.core.computation.align import _align, _reconstruct_object +from pandas.core.computation.align import align_terms, reconstruct_object from pandas.core.computation.ops import UndefinedVariableError, _mathops, _reductions import pandas.io.formats.printing as printing @@ -67,11 +67,11 @@ def evaluate(self): The result of the passed expression. """ if not self._is_aligned: - self.result_type, self.aligned_axes = _align(self.expr.terms) + self.result_type, self.aligned_axes = align_terms(self.expr.terms) # make sure no names in resolvers and locals/globals clash res = self._evaluate() - return _reconstruct_object( + return reconstruct_object( self.result_type, res, self.aligned_axes, self.expr.terms.return_type ) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 461561a80a7e5..de2133f64291d 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -10,6 +10,7 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.computation.engines import _engines +from pandas.core.computation.expr import Expr, _parsers, tokenize_string from pandas.core.computation.scope import _ensure_scope from pandas.io.formats.printing import pprint_thing @@ -64,7 +65,7 @@ def _check_engine(engine): return engine -def _check_parser(parser): +def _check_parser(parser: str): """ Make sure a valid parser is passed. @@ -77,7 +78,6 @@ def _check_parser(parser): KeyError * If an invalid parser is passed """ - from pandas.core.computation.expr import _parsers if parser not in _parsers: raise KeyError( @@ -115,7 +115,7 @@ def _check_expression(expr): raise ValueError("expr cannot be an empty string") -def _convert_expression(expr): +def _convert_expression(expr) -> str: """ Convert an object to an expression. @@ -131,7 +131,7 @@ def _convert_expression(expr): Returns ------- - s : unicode + str The string representation of an object. Raises @@ -144,8 +144,7 @@ def _convert_expression(expr): return s -def _check_for_locals(expr, stack_level, parser): - from pandas.core.computation.expr import tokenize_string +def _check_for_locals(expr: str, stack_level: int, parser: str): at_top_of_stack = stack_level == 0 not_pandas_parser = parser != "pandas" @@ -192,7 +191,7 @@ def eval( Parameters ---------- - expr : str or unicode + expr : str The expression to evaluate. 
This string cannot contain any Python `statements `__, @@ -282,7 +281,6 @@ def eval( See the :ref:`enhancing performance ` documentation for more details. """ - from pandas.core.computation.expr import Expr inplace = validate_bool_kwarg(inplace, "inplace") diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 929c9e69d56ac..4d1fc42070ea8 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -11,7 +11,6 @@ import numpy as np -import pandas as pd import pandas.core.common as com from pandas.core.computation.common import ( _BACKTICK_QUOTED_STRING, @@ -40,7 +39,7 @@ import pandas.io.formats.printing as printing -def tokenize_string(source): +def tokenize_string(source: str): """ Tokenize a Python source code string. @@ -171,7 +170,7 @@ def _compose(*funcs): def _preparse( - source, + source: str, f=_compose( _replace_locals, _replace_booleans, @@ -600,6 +599,8 @@ def visit_Index(self, node, **kwargs): return self.visit(node.value) def visit_Subscript(self, node, **kwargs): + import pandas as pd + value = self.visit(node.value) slobj = self.visit(node.slice) result = pd.eval( diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 0fdbdda30ad35..ce67c3152ecd0 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -13,7 +13,7 @@ from pandas.core.dtypes.common import is_list_like, is_scalar import pandas.core.common as com -from pandas.core.computation.common import _ensure_decoded, _result_type_many +from pandas.core.computation.common import _ensure_decoded, result_type_many from pandas.core.computation.scope import _DEFAULT_GLOBALS from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded @@ -218,7 +218,7 @@ def return_type(self): # clobber types to bool if the op is a boolean operator if self.op in (_cmp_ops_syms + _bool_ops_syms): return np.bool_ - return _result_type_many(*(term.type for term in com.flatten(self))) + return result_type_many(*(term.type for term in com.flatten(self))) @property def has_invalid_return_type(self) -> bool: From cb6859249d93dee7231d56bb3c2eb103ba2d97d0 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sun, 17 Nov 2019 14:06:55 +0000 Subject: [PATCH 084/185] CI: Adding GitHub action to assign issues based on comment (#29648) --- .github/workflows/assign.yml | 15 +++++++++++++++ doc/source/development/contributing.rst | 21 +++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 .github/workflows/assign.yml diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml new file mode 100644 index 0000000000000..019ecfc484ca5 --- /dev/null +++ b/.github/workflows/assign.yml @@ -0,0 +1,15 @@ +name: Assign +on: + issue_comment: + types: created + +jobs: + one: + runs-on: ubuntu-latest + steps: + - name: + run: | + if [[ "${{ github.event.comment.body }}" == "take" ]]; then + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + fi diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 8fe5b174c77d3..553b167034350 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -24,6 +24,27 @@ and `good first issue where you could start out. 
Once you've found an interesting issue, you can return here to get your
 development environment setup.
 
+When you start working on an issue, it's a good idea to assign the issue to yourself,
+so nobody else duplicates the work on it. GitHub restricts assigning issues to maintainers
+of the project only. In most projects, and until recently in pandas, contributors added a
+comment letting others know they are working on an issue. While this is ok, it means you
+need to check each issue individually, and it's not possible to find the unassigned ones.
+
+For this reason, we implemented a workaround consisting of adding a comment with the exact
+text `take`. When you do it, a GitHub action will automatically assign you the issue
+(this will take seconds, and may require refreshing the page to see it).
+By doing this, it's possible to filter the list of issues and find only the unassigned ones.
+
+So, a good way to find an issue to start contributing to pandas is to check the list of
+`unassigned good first issues `_
+and assign yourself one you like by writing a comment with the exact text `take`.
+
+If for whatever reason you are not able to continue working on the issue, please try to
+unassign it, so other people know it's available again. You can check the list of
+assigned issues, since people may not be working on them anymore. If you want to work on one
+that is assigned, feel free to kindly ask the current assignee if you can take it
+(please allow at least a week of inactivity before considering work on the issue discontinued).
+
 Feel free to ask questions on the `mailing list
 `_ or on `Gitter`_.
 
From 940ea476f9af1a9c220887f6ac47b52dc97e5f3f Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Sun, 17 Nov 2019 14:20:07 +0000
Subject: [PATCH 085/185] TST: Call tests just once with --dist=loadscope
 (#28531)

---
 .travis.yml          |  9 ---------
 ci/azure/posix.yml   | 29 ++++++-----------------------
 ci/print_skipped.py  | 19 +++++++++----------
 ci/run_tests.sh      | 42 +++++++++++++++++-------------------------
 environment.yml      |  2 +-
 requirements-dev.txt |  2 +-
 6 files changed, 34 insertions(+), 69 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 398dd07089ef9..048736e4bf1d0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -85,15 +85,6 @@ install:
   - ci/submit_cython_cache.sh
   - echo "install done"
 
-
-before_script:
-  # display server (for clipboard functionality) needs to be started here,
-  # does not work if done in install:setup_env.sh (GH-26103)
-  - export DISPLAY=":99.0"
-  - echo "sh -e /etc/init.d/xvfb start"
-  - if [ "$JOB" != "3.8-dev" ]; then sh -e /etc/init.d/xvfb start; fi
-  - sleep 3
-
 script:
   - echo "script start"
   - echo "$JOB"
diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
index d6afb263b447f..66960ca2c6c10 100644
--- a/ci/azure/posix.yml
+++ b/ci/azure/posix.yml
@@ -73,33 +73,16 @@ jobs:
 
     - task: PublishTestResults@2
       inputs:
-        testResultsFiles: 'test-data-*.xml'
+        testResultsFiles: 'test-data.xml'
         testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }}
       displayName: 'Publish test results'
 
     - powershell: |
-        $junitXml = "test-data-single.xml"
-        $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"'
-        if ($matches[1] -eq 0)
-        {
-          Write-Host "No test failures in test-data-single"
-        }
-        else
-        {
-          # note that this will produce $LASTEXITCODE=1
-          Write-Error "$($matches[1]) tests failed"
-        }
-
-        $junitXmlMulti = "test-data-multiple.xml"
-        $(Get-Content $junitXmlMulti | Out-String) -match 'failures="(.*?)"'
-        if ($matches[1] -eq 0)
-        {
-          Write-Host "No test failures in
test-data-multi"
-        }
-        else
-        {
-          # note that this will produce $LASTEXITCODE=1
-          Write-Error "$($matches[1]) tests failed"
+        $(Get-Content "test-data.xml" | Out-String) -match 'failures="(.*?)"'
+        if ($matches[1] -eq 0) {
+          Write-Host "No test failures in test-data"
+        } else {
+          Write-Error "$($matches[1]) tests failed"  # will produce $LASTEXITCODE=1
         }
       displayName: 'Check for test failures'
diff --git a/ci/print_skipped.py b/ci/print_skipped.py
index e99e789a71fe8..51a2460e05fab 100755
--- a/ci/print_skipped.py
+++ b/ci/print_skipped.py
@@ -27,14 +27,13 @@ def main(filename):
 if __name__ == "__main__":
     print("SKIPPED TESTS:")
     i = 1
-    for file_type in ("-single", "-multiple", ""):
-        for test_data in main("test-data{}.xml".format(file_type)):
-            if test_data is None:
-                print("-" * 80)
-            else:
-                print(
-                    "#{i} {class_name}.{test_name}: {message}".format(
-                        **dict(test_data, i=i)
-                    )
+    for test_data in main("test-data.xml"):
+        if test_data is None:
+            print("-" * 80)
+        else:
+            print(
+                "#{i} {class_name}.{test_name}: {message}".format(
+                    **dict(test_data, i=i)
                 )
-            i += 1
+            )
+        i += 1
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index d1a9447c97d4e..b91cfb3bed8cc 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -15,37 +15,29 @@ if [ -n "$LOCALE_OVERRIDE" ]; then
 #     exit 1
   fi
 fi
+
 if [[ "not network" == *"$PATTERN"* ]]; then
    export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4;
 fi
-
-if [ -n "$PATTERN" ]; then
-    PATTERN=" and $PATTERN"
+if [ "$COVERAGE" ]; then
+    COVERAGE_FNAME="/tmp/test_coverage.xml"
+    COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME"
 fi
 
-for TYPE in single multiple
-do
-    if [ "$COVERAGE" ]; then
-        COVERAGE_FNAME="/tmp/coc-$TYPE.xml"
-        COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME"
-    fi
+PYTEST_CMD="pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
 
-    TYPE_PATTERN=$TYPE
-    NUM_JOBS=1
-    if [[ "$TYPE_PATTERN" == "multiple" ]]; then
-        TYPE_PATTERN="not single"
-        NUM_JOBS=2
-    fi
+# Travis does not have an X server
+if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+    export DISPLAY=":99.0"
+    PYTEST_CMD="xvfb-run -e /dev/stdout $PYTEST_CMD"
+fi
 
-    PYTEST_CMD="pytest -m \"$TYPE_PATTERN$PATTERN\" -n $NUM_JOBS -s --strict --durations=10 --junitxml=test-data-$TYPE.xml $TEST_ARGS $COVERAGE pandas"
-    echo $PYTEST_CMD
-    # if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code
-    sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret"
+echo $PYTEST_CMD
+sh -c "$PYTEST_CMD"
 
-    if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then
-        echo "uploading coverage for $TYPE tests"
-        echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
-        bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
-    fi
-done
+if [[ "$COVERAGE" && $?
== 0 && "$TRAVIS_BRANCH" == "master" ]]; then
+    echo "uploading coverage"
+    echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
+    bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
+fi
diff --git a/environment.yml b/environment.yml
index a3582c56ee9d2..bbf3c036f65c4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -53,7 +53,7 @@ dependencies:
   - moto  # mock S3
   - pytest>=4.0.2
   - pytest-cov
-  - pytest-xdist
+  - pytest-xdist>=1.21
   - seaborn
   - statsmodels
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6235b61d92f29..5633a58f254ca 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -32,7 +32,7 @@ hypothesis>=3.82
 moto
 pytest>=4.0.2
 pytest-cov
-pytest-xdist
+pytest-xdist>=1.21
 seaborn
 statsmodels
 ipywidgets

From 86d49f45ffc7dbd4453e375bf2b2772664977220 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 17 Nov 2019 08:25:04 -0600
Subject: [PATCH 086/185] Triage guide (#29616)

---
 doc/source/development/index.rst       |   1 +
 doc/source/development/maintaining.rst | 193 +++++++++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 doc/source/development/maintaining.rst

diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst
index 1228f00667f3a..a523ae0c957f1 100644
--- a/doc/source/development/index.rst
+++ b/doc/source/development/index.rst
@@ -13,6 +13,7 @@ Development
    :maxdepth: 2
 
    contributing
+   maintaining
    internals
    extending
    developer
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
new file mode 100644
index 0000000000000..0d1088cc8a6ca
--- /dev/null
+++ b/doc/source/development/maintaining.rst
@@ -0,0 +1,193 @@
+.. _maintaining:
+
+******************
+Pandas Maintenance
+******************
+
+This guide is for pandas' maintainers. It may also be interesting to contributors
+looking to understand the pandas development process and what steps are necessary
+to become a maintainer.
+
+The main contributing guide is available at :ref:`contributing`.
+
+Roles
+-----
+
+Pandas uses two levels of permissions: **triage** and **core** team members.
+
+Triage members can label and close issues and pull requests.
+
+Core team members can label and close issues and pull requests, and can merge
+pull requests.
+
+GitHub publishes the full `list of permissions`_.
+
+Tasks
+-----
+
+Pandas is largely a volunteer project, so these tasks shouldn't be read as
+"expectations" of triage and maintainers. Rather, they're general descriptions
+of what it means to be a maintainer.
+
+* Triage newly filed issues (see :ref:`maintaining.triage`)
+* Review newly opened pull requests
+* Respond to updates on existing issues and pull requests
+* Drive discussion and decisions on stalled issues and pull requests
+* Provide experience / wisdom on API design questions to ensure consistency and maintainability
+* Project organization (run / attend developer meetings, represent pandas)
+
+http://matthewrocklin.com/blog/2019/05/18/maintainer may be interesting background
+reading.
+
+.. _maintaining.triage:
+
+Issue Triage
+------------
+
+
+Here's a typical workflow for triaging a newly opened issue.
+
+1. **Thank the reporter for opening an issue**
+
+   The issue tracker is many people's first interaction with the pandas project itself,
+   beyond just using the library. As such, we want it to be a welcoming, pleasant
+   experience.
+
+2. **Is the necessary information provided?**
+
+   Ideally reporters would fill out the issue template, but many don't.
+   If crucial information (like the version of pandas they used) is missing,
+   feel free to ask for that and label the issue with "Needs info". The
+   report should follow the guidelines in :ref:`contributing.bug_reports`.
+   You may want to link to that if they didn't follow the template.
+
+   Make sure that the title accurately reflects the issue. Edit it yourself
+   if it's not clear.
+
+3. **Is this a duplicate issue?**
+
+   We have many open issues. If a new issue is clearly a duplicate, label the
+   new issue as "Duplicate", assign the milestone "No Action", and close the issue
+   with a link to the original issue. Make sure to still thank the reporter, and
+   encourage them to chime in on the original issue, and perhaps try to fix it.
+
+   If the new issue provides relevant information, such as a better or slightly
+   different example, add it to the original issue as a comment or an edit to
+   the original post.
+
+4. **Is the issue minimal and reproducible?**
+
+   For bug reports, we ask that the reporter provide a minimal reproducible
+   example. See http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports
+   for a good explanation. If the example is not reproducible, or if it's
+   *clearly* not minimal, feel free to ask the reporter if they can provide
+   an example or simplify the provided one. Do acknowledge that writing
+   minimal reproducible examples is hard work. If the reporter is struggling,
+   you can try to write one yourself and we'll edit the original post to include it.
+
+   If a reproducible example can't be provided, add the "Needs info" label.
+
+   If a reproducible example is provided, but you see a simplification,
+   edit the original post with your simpler reproducible example.
+
+5. **Is this a clearly defined feature request?**
+
+   Generally, pandas prefers to discuss and design new features in issues, before
+   a pull request is made. Encourage the submitter to include a proposed API
+   for the new feature. Having them write a full docstring is a good way to
+   pin down specifics.
+
+   We'll need a discussion from several pandas maintainers before deciding whether
+   the proposal is in scope for pandas.
+
+6. **Is this a usage question?**
+
+   We prefer that usage questions are asked on StackOverflow with the pandas
+   tag. https://stackoverflow.com/questions/tagged/pandas
+
+   If it's easy to answer, feel free to link to the relevant documentation section,
+   let them know that in the future this kind of question should be on
+   StackOverflow, and close the issue.
+
+7. **What labels and milestones should I add?**
+
+   Apply the relevant labels. This is a bit of an art, and comes with experience.
+   Look at similar issues to get a feel for how things are labeled.
+
+   If the issue is clearly defined and the fix seems relatively straightforward,
+   label the issue as "Good first issue".
+
+   Typically, new issues will be assigned the "Contributions welcome" milestone,
+   unless it's known that this issue should be addressed in a specific release (say
+   because it's a large regression).
+
+.. _maintaining.closing:
+
+Closing Issues
+--------------
+
+Be delicate here: many people interpret closing an issue as us saying that the
+conversation is over. It's typically best to give the reporter some time to
+respond or self-close their issue if it's determined that the behavior is not a bug,
+or the feature is out of scope. Sometimes reporters just go away though, and
+we'll close the issue after the conversation has died.
+
+Reviewing Pull Requests
+-----------------------
+
+Anybody can review a pull request: regular contributors, triagers, or core-team
+members. Here are some guidelines to check.
+
+* Tests should be in a sensible location.
+* New public APIs should be included somewhere in ``doc/source/reference/``.
+* New / changed API should use the ``versionadded`` or ``versionchanged`` directives in the docstring.
+* User-facing changes should have a whatsnew in the appropriate file.
+* Regression tests should reference the original GitHub issue number like ``# GH-1234``.
+
+Cleaning up old Issues
+----------------------
+
+Every open issue in pandas has a cost. Open issues make finding duplicates harder,
+and can make it harder to know what needs to be done in pandas. That said, closing
+issues isn't a goal on its own. Our goal is to make pandas the best it can be,
+and that's best done by ensuring that the quality of our open issues is high.
+
+Occasionally, bugs are fixed but the issue isn't linked to in the Pull Request.
+In these cases, comment that "This has been fixed, but could use a test." and
+label the issue as "Good First Issue" and "Needs Test".
+
+If an older issue doesn't follow our issue template, edit the original post to
+include a minimal example, the actual output, and the expected output. Uniformity
+in issue reports is valuable.
+
+If an older issue lacks a reproducible example, label it as "Needs Info" and
+ask them to provide one (or write one yourself if possible). If one isn't
+provided reasonably soon, close it according to the policies in :ref:`maintaining.closing`.
+
+Cleaning up old Pull Requests
+-----------------------------
+
+Occasionally, contributors are unable to finish off a pull request.
+If some time has passed (two weeks, say) since the last review requesting changes,
+gently ask if they're still interested in working on this. If another two weeks or
+so passes with no response, thank them for their work and close the pull request.
+Comment on the original issue that "There's a stalled PR at #1234 that may be
+helpful.", and perhaps label the issue as "Good first issue" if the PR was relatively
+close to being accepted.
+
+Additionally, core-team members can push to contributors' branches. This can be
+helpful for pushing an important PR across the line, or for fixing a small
+merge conflict.
+
+Becoming a pandas maintainer
+----------------------------
+
+The full process is outlined in our `governance documents`_. In summary,
+we're happy to give triage permissions to anyone who shows interest by
+being helpful on the issue tracker.
+
+The current list of core-team members is at
+https://github.com/pandas-dev/pandas-governance/blob/master/people.md
+
+.. _governance documents: https://github.com/pandas-dev/pandas-governance
+.. 
_list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization \ No newline at end of file From 054936f5f9450bc4365c1e6f635cf0a377c3461e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 17 Nov 2019 14:27:18 +0000 Subject: [PATCH 087/185] CI: bump mypy 0.730 (#29653) --- environment.yml | 2 +- requirements-dev.txt | 2 +- setup.cfg | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index bbf3c036f65c4..9df4b4e8a371f 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - isort # check that imports are in the right order - - mypy=0.720 + - mypy=0.730 - pycodestyle # used by flake8 # documentation diff --git a/requirements-dev.txt b/requirements-dev.txt index 5633a58f254ca..33f4e057c3dc9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,7 +9,7 @@ flake8 flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 isort -mypy==0.720 +mypy==0.730 pycodestyle gitpython sphinx diff --git a/setup.cfg b/setup.cfg index 10670a4eae387..46e6b88f8018a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -145,10 +145,13 @@ ignore_errors=True [mypy-pandas.tests.extension.json.test_json] ignore_errors=True +[mypy-pandas.tests.indexes.datetimes.test_tools] +ignore_errors=True + [mypy-pandas.tests.indexes.test_base] ignore_errors=True -[mypy-pandas.tests.indexing.test_loc] +[mypy-pandas.tests.scalar.period.test_period] ignore_errors=True [mypy-pandas.tests.series.test_operators] From c79fc046c8dcb6810202b6dbb5ff41e0e56d685e Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sun, 17 Nov 2019 15:42:11 +0000 Subject: [PATCH 088/185] Use black 19.10b0 (#29508) --- .pre-commit-config.yaml | 2 +- doc/source/development/contributing.rst | 3 +++ environment.yml | 2 +- pandas/core/algorithms.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/grouper.py | 4 +--- pandas/core/indexes/base.py | 3 +-- pandas/core/indexing.py | 6 +++--- pandas/core/internals/managers.py | 2 +- pandas/io/parsers.py | 14 ++++++++++++-- pandas/io/stata.py | 2 +- pandas/tests/arrays/sparse/test_array.py | 12 ++++++++---- pandas/tests/dtypes/test_inference.py | 2 +- pandas/tests/frame/test_constructors.py | 6 +++--- pandas/tests/indexes/period/test_construction.py | 2 +- pandas/tests/indexing/multiindex/test_getitem.py | 2 +- pandas/tests/indexing/multiindex/test_xs.py | 2 +- pandas/tests/indexing/test_callable.py | 12 +++++++++--- pandas/tests/io/parser/test_index_col.py | 4 +--- pandas/tests/reductions/test_reductions.py | 4 ++-- pandas/tests/test_algos.py | 10 +++++----- pandas/tests/test_nanops.py | 2 +- requirements-dev.txt | 2 +- 24 files changed, 61 insertions(+), 43 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3bed68fd8d2fc..b34f5dfdd1a83 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/python/black - rev: stable + rev: 19.10b0 hooks: - id: black language_version: python3.7 diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 553b167034350..33084d0d23771 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -654,6 +654,9 @@ submitting code to run the check yourself:: 
to auto-format your code. Additionally, many editors have plugins that will apply ``black`` as you edit files. +You should use a ``black`` version >= 19.10b0 as previous versions are not compatible +with the pandas codebase. + Optionally, you may wish to setup `pre-commit hooks `_ to automatically run ``black`` and ``flake8`` when you make a git commit. This can be done by installing ``pre-commit``:: diff --git a/environment.yml b/environment.yml index 9df4b4e8a371f..325b79f07a61c 100644 --- a/environment.yml +++ b/environment.yml @@ -15,7 +15,7 @@ dependencies: - cython>=0.29.13 # code checks - - black<=19.3b0 + - black>=19.10b0 - cpplint - flake8 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b49a9d7957d51..ea75d46048e63 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1159,7 +1159,7 @@ def compute(self, method): n = min(n, narr) kth_val = algos.kth_smallest(arr.copy(), n - 1) - ns, = np.nonzero(arr <= kth_val) + (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] if self.keep != "all": diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 442994a04caee..fae5a6b549af7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4774,7 +4774,7 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False): duplicated = self.duplicated(subset, keep=keep) if inplace: - inds, = (-duplicated)._ndarray_values.nonzero() + (inds,) = (-duplicated)._ndarray_values.nonzero() new_data = self._data.take(inds) self._update_inplace(new_data) else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 17784b623c414..d76c870d6227e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3599,7 +3599,7 @@ class animal locomotion if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: - inds, = loc.nonzero() + (inds,) = loc.nonzero() return self.take(inds, axis=axis) else: return self.take(loc, axis=axis) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 0edc3e4a4ff3d..eb1442aeb8a4c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -292,9 +292,7 @@ def __init__( self.grouper, self._codes, self._group_index, - ) = index._get_grouper_for_level( # noqa: E501 - self.grouper, level - ) + ) = index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a8c7100b3846a..c554c501ae7ce 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1871,8 +1871,7 @@ def _isnan(self): @cache_readonly def _nan_idxs(self): if self._can_hold_na: - w = self._isnan.nonzero()[0] - return w + return self._isnan.nonzero()[0] else: return np.array([], dtype=np.int64) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7db54f4305c2e..673764ef6a124 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -319,7 +319,7 @@ def _setitem_with_indexer(self, indexer, value): # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value if not take_split_path and self.obj._data.blocks: - blk, = self.obj._data.blocks + (blk,) = self.obj._data.blocks if 1 < blk.ndim: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else 
value take_split_path = not blk._can_hold_element(val) @@ -1111,7 +1111,7 @@ def _getitem_iterable(self, key, axis: int): if com.is_bool_indexer(key): # A boolean indexer key = check_bool_indexer(labels, key) - inds, = key.nonzero() + (inds,) = key.nonzero() return self.obj.take(inds, axis=axis) else: # A collection of keys @@ -1255,7 +1255,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): if com.is_bool_indexer(obj): obj = check_bool_indexer(labels, obj) - inds, = obj.nonzero() + (inds,) = obj.nonzero() return inds else: # When setting, missing keys are not allowed, even with .loc: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d92167f8a3b19..8a9410c076f9b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1860,7 +1860,7 @@ def _shape_compat(x): def _interleaved_dtype( - blocks: List[Block] + blocks: List[Block], ) -> Optional[Union[np.dtype, ExtensionDtype]]: """Find the common dtype for `blocks`. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d9e505f0b30cd..2cb4a5c8bb2f6 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1918,7 +1918,12 @@ def __init__(self, src, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns - self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( # noqa: E501 + ( + self.names, + self.index_names, + self.col_names, + passed_names, + ) = self._extract_multi_indexer_columns( self._reader.header, self.index_names, self.col_names, passed_names ) else: @@ -2307,7 +2312,12 @@ def __init__(self, f, **kwds): # The original set is stored in self.original_columns. if len(self.columns) > 1: # we are processing a multi index column - self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( # noqa: E501 + ( + self.columns, + self.index_names, + self.col_names, + _, + ) = self._extract_multi_indexer_columns( self.columns, self.index_names, self.col_names ) # Update list of original names to include all indices. diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d51c9170c488b..d970f2819c3c1 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -614,7 +614,7 @@ def _cast_to_stata_types(data): data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) - if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53: + if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53): ws = precision_loss_doc % ("int64", "float64") elif dtype in (np.float32, np.float64): value = data[col].max() diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index f9bb4981df7df..755cbfb716fcd 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -658,12 +658,16 @@ def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) - res = sparse[4:,] # noqa: E231 + res = sparse[ + 4:, + ] # noqa: E231 exp = SparseArray(dense[4:,]) # noqa: E231 tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) - res = sparse[4:,] # noqa: E231 + res = sparse[ + 4:, + ] # noqa: E231 exp = SparseArray(dense[4:,], fill_value=0) # noqa: E231 tm.assert_sp_array_equal(res, exp) @@ -823,11 +827,11 @@ def test_nonzero(self): # Tests regression #21172. 
sa = pd.SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) expected = np.array([2, 5, 9], dtype=np.int32) - result, = sa.nonzero() + (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) sa = pd.SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) - result, = sa.nonzero() + (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0408c78ac1536..743b844917edf 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -506,7 +506,7 @@ def test_convert_numeric_int64_uint64(self, case, coerce): result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("value", [-2 ** 63 - 1, 2 ** 64]) + @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64]) def test_convert_int_overflow(self, value): # see gh-18584 arr = np.array([value], dtype=object) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 77a7783deeee3..cccce96a874dd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -245,9 +245,9 @@ def test_constructor_overflow_int64(self): np.array([2 ** 64], dtype=object), np.array([2 ** 65]), [2 ** 64 + 1], - np.array([-2 ** 63 - 4], dtype=object), - np.array([-2 ** 64 - 1]), - [-2 ** 65 - 2], + np.array([-(2 ** 63) - 4], dtype=object), + np.array([-(2 ** 64) - 1]), + [-(2 ** 65) - 2], ], ) def test_constructor_int_overflow(self, values): diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 8c75fbbae7de3..1973cb7f4740d 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -434,7 +434,7 @@ def test_constructor_range_based_deprecated_different_freq(self): with tm.assert_produces_warning(FutureWarning) as m: PeriodIndex(start="2000", periods=2) - warning, = m + (warning,) = m assert 'freq="A-DEC"' in str(warning.message) def test_constructor(self): diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 4f95e6bd28989..519a1eb5b16d8 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -108,7 +108,7 @@ def test_series_getitem_indexing_errors( def test_series_getitem_corner_generator( - multiindex_year_month_day_dataframe_random_data + multiindex_year_month_day_dataframe_random_data, ): s = multiindex_year_month_day_dataframe_random_data["A"] result = s[(x > 0 for x in s)] diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index c81712b1e0496..ffbe1bb785cda 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -207,7 +207,7 @@ def test_xs_level_series_ymd(multiindex_year_month_day_dataframe_random_data): def test_xs_level_series_slice_not_implemented( - multiindex_year_month_day_dataframe_random_data + multiindex_year_month_day_dataframe_random_data, ): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index aa73bd728595f..81dedfdc74409 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -17,10 
+17,14 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[lambda x: x.A > 2,] # noqa: E231 + res = df.loc[ + lambda x: x.A > 2, + ] # noqa: E231 tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 - res = df.loc[lambda x: x.A > 2,] # noqa: E231 + res = df.loc[ + lambda x: x.A > 2, + ] # noqa: E231 tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 res = df.loc[lambda x: x.B == "b", :] @@ -90,7 +94,9 @@ def test_frame_loc_callable_labels(self): res = df.loc[lambda x: ["A", "C"]] tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[lambda x: ["A", "C"],] # noqa: E231 + res = df.loc[ + lambda x: ["A", "C"], + ] # noqa: E231 tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231 res = df.loc[lambda x: ["A", "C"], :] diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 4dfb8d3bd2dc8..66e00f4eb6c1c 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -21,9 +21,7 @@ def test_index_col_named(all_parsers, with_header): KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - header = ( - "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" - ) # noqa + header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" if with_header: data = header + no_header diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 4dfe561831ced..b0ef0c58ca65a 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -179,8 +179,8 @@ class TestIndexReductions: [ (0, 400, 3), (500, 0, -6), - (-10 ** 6, 10 ** 6, 4), - (10 ** 6, -10 ** 6, -4), + (-(10 ** 6), 10 ** 6, 4), + (10 ** 6, -(10 ** 6), -4), (0, 10, 20), ], ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 0bc09ddc40035..baf78d7188b41 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -223,10 +223,10 @@ def test_uint64_factorize(self, writable): tm.assert_numpy_array_equal(uniques, expected_uniques) def test_int64_factorize(self, writable): - data = np.array([2 ** 63 - 1, -2 ** 63, 2 ** 63 - 1], dtype=np.int64) + data = np.array([2 ** 63 - 1, -(2 ** 63), 2 ** 63 - 1], dtype=np.int64) data.setflags(write=writable) expected_codes = np.array([0, 1, 0], dtype=np.intp) - expected_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64) + expected_uniques = np.array([2 ** 63 - 1, -(2 ** 63)], dtype=np.int64) codes, uniques = algos.factorize(data) tm.assert_numpy_array_equal(codes, expected_codes) @@ -265,7 +265,7 @@ def test_deprecate_order(self): "data", [ np.array([0, 1, 0], dtype="u8"), - np.array([-2 ** 63, 1, -2 ** 63], dtype="i8"), + np.array([-(2 ** 63), 1, -(2 ** 63)], dtype="i8"), np.array(["__nan__", "foo", "__nan__"], dtype="object"), ], ) @@ -282,8 +282,8 @@ def test_parametrized_factorize_na_value_default(self, data): [ (np.array([0, 1, 0, 2], dtype="u8"), 0), (np.array([1, 0, 1, 2], dtype="u8"), 1), - (np.array([-2 ** 63, 1, -2 ** 63, 0], dtype="i8"), -2 ** 63), - (np.array([1, -2 ** 63, 1, 0], dtype="i8"), 1), + (np.array([-(2 ** 63), 1, -(2 ** 63), 0], dtype="i8"), -(2 ** 63)), + (np.array([1, -(2 ** 63), 1, 0], dtype="i8"), 1), (np.array(["a", "", "a", "b"], dtype=object), 
"a"), (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()), (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)), diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 7b76a1c0a6c86..e5d963a307502 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -302,7 +302,7 @@ def test_nanmean_overflow(self): # In the previous implementation mean can overflow for int dtypes, it # is now consistent with numpy - for a in [2 ** 55, -2 ** 55, 20150515061816532]: + for a in [2 ** 55, -(2 ** 55), 20150515061816532]: s = Series(a, index=range(500), dtype=np.int64) result = s.mean() np_result = s.values.mean() diff --git a/requirements-dev.txt b/requirements-dev.txt index 33f4e057c3dc9..f589812e81635 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,7 @@ python-dateutil>=2.6.1 pytz asv cython>=0.29.13 -black<=19.3b0 +black>=19.10b0 cpplint flake8 flake8-comprehensions>=3.1.0 From debaf9a745ab989fdf6bc1cfa1b3bedfb0b40c78 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 17 Nov 2019 20:55:53 +0000 Subject: [PATCH 089/185] TYP: Add type hint for BaseGrouper in groupby._Groupby (#29675) --- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/groupby/grouper.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 294cb723eee1a..3199f166d5b3f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -48,7 +48,7 @@ class providing the base-class of operations. from pandas.core.construction import extract_array from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base +from pandas.core.groupby import base, ops from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -345,7 +345,7 @@ def __init__( keys=None, axis: int = 0, level=None, - grouper=None, + grouper: "Optional[ops.BaseGrouper]" = None, exclusions=None, selection=None, as_index: bool = True, @@ -2480,7 +2480,7 @@ def get_groupby( by=None, axis: int = 0, level=None, - grouper=None, + grouper: "Optional[ops.BaseGrouper]" = None, exclusions=None, selection=None, as_index: bool = True, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index eb1442aeb8a4c..c37617b1f1f7f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -26,8 +26,8 @@ from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com from pandas.core.frame import DataFrame +from pandas.core.groupby import ops from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby -from pandas.core.groupby.ops import BaseGrouper from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series @@ -390,7 +390,7 @@ def ngroups(self) -> int: @cache_readonly def indices(self): # we have a list of groupers - if isinstance(self.grouper, BaseGrouper): + if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices values = ensure_categorical(self.grouper) @@ -417,7 +417,7 @@ def group_index(self) -> Index: def _make_codes(self) -> None: if self._codes is None or self._group_index is None: # we have a list of groupers - if isinstance(self.grouper, BaseGrouper): + if isinstance(self.grouper, ops.BaseGrouper): codes = self.grouper.codes_info uniques = self.grouper.result_index 
else: @@ -440,7 +440,7 @@ def get_grouper( observed: bool = False, mutated: bool = False, validate: bool = True, -) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]: +) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -522,7 +522,7 @@ def get_grouper( return grouper, [key.key], obj # already have a BaseGrouper, just return it - elif isinstance(key, BaseGrouper): + elif isinstance(key, ops.BaseGrouper): return key, [], obj # In the future, a tuple key will always mean an actual key, @@ -669,7 +669,7 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) return grouper, exclusions, obj From 7b861ab60127720267a15591b4e5c7189e59a6ea Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sun, 17 Nov 2019 15:47:51 -0700 Subject: [PATCH 090/185] CLN: Simplify black command in Makefile (#29679) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 27a2c3682de9c..f26689ab65ba5 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 black: - black . --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)' + black . develop: build python -m pip install --no-build-isolation -e . From 55ef668fc1cddd1bd214387d5b82e438c99b06ab Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sun, 17 Nov 2019 22:48:17 +0000 Subject: [PATCH 091/185] CI: Pin black to version 19.10b0 (#29673) --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 325b79f07a61c..ef5767f26dceb 100644 --- a/environment.yml +++ b/environment.yml @@ -15,7 +15,7 @@ dependencies: - cython>=0.29.13 # code checks - - black>=19.10b0 + - black=19.10b0 - cpplint - flake8 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions diff --git a/requirements-dev.txt b/requirements-dev.txt index f589812e81635..3ae5b57de5d02 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,7 @@ python-dateutil>=2.6.1 pytz asv cython>=0.29.13 -black>=19.10b0 +black==19.10b0 cpplint flake8 flake8-comprehensions>=3.1.0 From 7038f85e3ca21d7e342067a6c360e56a77499a3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuzhan=20=C3=96=C4=9Freden?= Date: Sun, 17 Nov 2019 23:53:30 +0100 Subject: [PATCH 092/185] Makes NumericIndex constructor dtype aware (#29529) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/indexes/base.py | 24 ------------------------ pandas/core/indexes/numeric.py | 13 +++++++++++-- pandas/tests/indexes/test_numeric.py | 5 +++++ 4 files changed, 17 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c91ced1014dd1..162f9dea06482 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -343,6 +343,7 @@ Numeric - Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) - Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`) - Improved error message 
when using `frac` > 1 and `replace` = False (:issue:`27451`) +- Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`) - Conversion diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c554c501ae7ce..2321e077df285 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4027,30 +4027,6 @@ def _string_data_error(cls, data): "to explicitly cast to a numeric type" ) - @classmethod - def _coerce_to_ndarray(cls, data): - """ - Coerces data to ndarray. - - Converts other iterables to list first and then to array. - Does not touch ndarrays. - - Raises - ------ - TypeError - When the data passed in is a scalar. - """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - raise cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index ee96e4cd699bb..074cce085fb3c 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -22,6 +22,7 @@ ABCFloat64Index, ABCInt64Index, ABCRangeIndex, + ABCSeries, ABCUInt64Index, ) from pandas.core.dtypes.missing import isna @@ -55,8 +56,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): if fastpath: return cls._simple_new(data, name=name) - # is_scalar, generators handled in coerce_to_ndarray - data = cls._coerce_to_ndarray(data) + # Coerce to ndarray if not already ndarray or Index + if not isinstance(data, (np.ndarray, Index)): + if is_scalar(data): + raise cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + + data = np.asarray(data, dtype=dtype) if issubclass(data.dtype.type, str): cls._string_data_error(data) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index deb63cc9ef854..e0f7b1d1ade3d 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -944,6 +944,11 @@ def test_constructor(self): res = Index(np.array([-1, 2 ** 63], dtype=object)) tm.assert_index_equal(res, idx) + # https://github.com/pandas-dev/pandas/issues/29526 + idx = UInt64Index([1, 2 ** 63 + 1], dtype=np.uint64) + res = Index([1, 2 ** 63 + 1], dtype=np.uint64) + tm.assert_index_equal(res, idx) + def test_get_indexer(self, index_large): target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) indexer = index_large.get_indexer(target) From 787ea549b921eab39ca9f8f67a2f77b8645cec33 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 15:00:11 -0800 Subject: [PATCH 093/185] REF: de-duplicate _apply_to_group (#29628) --- pandas/_libs/reduction.pyx | 53 +++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 8733249888ae9..f5521b94b6c33 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -189,6 +189,27 @@ cdef class _BaseGrouper: return cached_typ, cached_ityp + cdef inline object _apply_to_group(self, + object cached_typ, object cached_ityp, + Slider islider, Slider vslider, + Py_ssize_t group_size, bint initialized): + """ + Call self.f on our new group, then 
update to the next group. + """ + cached_ityp._engine.clear_mapping() + res = self.f(cached_typ) + res = _extract_result(res) + if not initialized: + # On the first pass, we check the output shape to see + # if this looks like a reduction. + initialized = 1 + _check_result_array(res, len(self.dummy_arr)) + + islider.advance(group_size) + vslider.advance(group_size) + + return res, initialized + cdef class SeriesBinGrouper(_BaseGrouper): """ @@ -217,7 +238,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series, 'name', None) + self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) @@ -265,20 +286,12 @@ cdef class SeriesBinGrouper(_BaseGrouper): cached_typ, cached_ityp = self._update_cached_objs( cached_typ, cached_ityp, islider, vslider) - cached_ityp._engine.clear_mapping() - res = self.f(cached_typ) - res = _extract_result(res) - if not initialized: - # On the first pass, we check the output shape to see - # if this looks like a reduction. - initialized = 1 - _check_result_array(res, len(self.dummy_arr)) + res, initialized = self._apply_to_group(cached_typ, cached_ityp, + islider, vslider, + group_size, initialized) result[i] = res - islider.advance(group_size) - vslider.advance(group_size) - finally: # so we don't free the wrong memory islider.reset() @@ -322,7 +335,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series, 'name', None) + self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) self.ngroups = ngroups @@ -367,20 +380,12 @@ cdef class SeriesGrouper(_BaseGrouper): cached_typ, cached_ityp = self._update_cached_objs( cached_typ, cached_ityp, islider, vslider) - cached_ityp._engine.clear_mapping() - res = self.f(cached_typ) - res = _extract_result(res) - if not initialized: - # On the first pass, we check the output shape to see - # if this looks like a reduction. 
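
(Context for the surrounding hunk: the block being deleted here had been
duplicated verbatim between SeriesBinGrouper.get_result and
SeriesGrouper.get_result. The patch hoists it into the shared
_BaseGrouper._apply_to_group helper defined above, so each loop body collapses
to the single call

    res, initialized = self._apply_to_group(cached_typ, cached_ityp,
                                            islider, vslider,
                                            group_size, initialized)

and the first-pass shape check plus the slider advancement now live in one
place.)
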
- initialized = 1 - _check_result_array(res, len(self.dummy_arr)) + res, initialized = self._apply_to_group(cached_typ, cached_ityp, + islider, vslider, + group_size, initialized) result[lab] = res counts[lab] = group_size - islider.advance(group_size) - vslider.advance(group_size) - group_size = 0 finally: From deceebe01aa9a3e4631828cb5b7453b25e9620bb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 15:01:50 -0800 Subject: [PATCH 094/185] BUG: IndexError in __repr__ (#29681) --- pandas/core/computation/pytables.py | 7 ++++-- pandas/io/pytables.py | 37 +++++++++++------------------ 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 13a4814068d6a..4eb39898214c5 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -2,7 +2,7 @@ import ast from functools import partial -from typing import Optional +from typing import Any, Optional, Tuple import numpy as np @@ -72,7 +72,6 @@ def __init__(self, op, lhs, rhs, queryables, encoding): super().__init__(op, lhs, rhs) self.queryables = queryables self.encoding = encoding - self.filter = None self.condition = None def _disallow_scalar_only_bool_ops(self): @@ -230,7 +229,11 @@ def convert_values(self): class FilterBinOp(BinOp): + filter: Optional[Tuple[Any, Any, pd.Index]] = None + def __repr__(self) -> str: + if self.filter is None: + return "Filter: Not Initialized" return pprint_thing( "[Filter : [{lhs}] -> [{op}]".format(lhs=self.filter[0], op=self.filter[1]) ) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7c7b78720d46d..f41c767d0b13a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3868,30 +3868,21 @@ def get_blk_items(mgr, blocks): else: existing_col = None - try: - col = klass.create_for_block(i=i, name=name, version=self.version) - col.set_atom( - block=b, - block_items=b_items, - existing_col=existing_col, - min_itemsize=min_itemsize, - nan_rep=nan_rep, - encoding=self.encoding, - errors=self.errors, - info=self.info, - ) - col.set_pos(j) + col = klass.create_for_block(i=i, name=name, version=self.version) + col.set_atom( + block=b, + block_items=b_items, + existing_col=existing_col, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + encoding=self.encoding, + errors=self.errors, + info=self.info, + ) + col.set_pos(j) + + self.values_axes.append(col) - self.values_axes.append(col) - except (NotImplementedError, ValueError, TypeError) as e: - raise e - except Exception as detail: - raise Exception( - "cannot find the correct atom type -> " - "[dtype->{name},items->{items}] {detail!s}".format( - name=b.dtype.name, items=b_items, detail=detail - ) - ) j += 1 # validate our min_itemsize From 5a0f7e9e03976020ba52a7473f90cb1c8a4354c0 Mon Sep 17 00:00:00 2001 From: Elle <42851573+ellequelle@users.noreply.github.com> Date: Sun, 17 Nov 2019 18:06:34 -0500 Subject: [PATCH 095/185] Bug fix GH 29624: calling str.isalpha on empty series returns object dtype, not bool (#29680) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/strings.py | 10 ++++++++++ pandas/tests/test_strings.py | 19 ++++++++++--------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 162f9dea06482..39f33c3bb8cb7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -355,7 +355,7 @@ Conversion Strings ^^^^^^^ -- +- Calling :meth:`Series.str.isalnum` (and other "ismethods") on an empty 
Series would return an object dtype instead of bool (:issue:`29624`) - diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a6e0c12526d8a..55ce44d736864 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -3401,59 +3401,69 @@ def rindex(self, sub, start=0, end=None): _doc_args["istitle"] = dict(type="titlecase", method="istitle") _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") + # force _noarg_wrapper return type with dtype=bool (GH 29624) isalnum = _noarg_wrapper( lambda x: x.isalnum(), name="isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], returns_string=False, + dtype=bool, ) isalpha = _noarg_wrapper( lambda x: x.isalpha(), name="isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], returns_string=False, + dtype=bool, ) isdigit = _noarg_wrapper( lambda x: x.isdigit(), name="isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], returns_string=False, + dtype=bool, ) isspace = _noarg_wrapper( lambda x: x.isspace(), name="isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"], returns_string=False, + dtype=bool, ) islower = _noarg_wrapper( lambda x: x.islower(), name="islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"], returns_string=False, + dtype=bool, ) isupper = _noarg_wrapper( lambda x: x.isupper(), name="isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"], returns_string=False, + dtype=bool, ) istitle = _noarg_wrapper( lambda x: x.istitle(), name="istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"], returns_string=False, + dtype=bool, ) isnumeric = _noarg_wrapper( lambda x: x.isnumeric(), name="isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], returns_string=False, + dtype=bool, ) isdecimal = _noarg_wrapper( lambda x: x.isdecimal(), name="isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], returns_string=False, + dtype=bool, ) @classmethod diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f5d28ec82d1d4..f68541b620efa 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1853,15 +1853,16 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.get(0)) tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) - tm.assert_series_equal(empty_str, empty.str.isalnum()) - tm.assert_series_equal(empty_str, empty.str.isalpha()) - tm.assert_series_equal(empty_str, empty.str.isdigit()) - tm.assert_series_equal(empty_str, empty.str.isspace()) - tm.assert_series_equal(empty_str, empty.str.islower()) - tm.assert_series_equal(empty_str, empty.str.isupper()) - tm.assert_series_equal(empty_str, empty.str.istitle()) - tm.assert_series_equal(empty_str, empty.str.isnumeric()) - tm.assert_series_equal(empty_str, empty.str.isdecimal()) + # ismethods should always return boolean (GH 29624) + tm.assert_series_equal(empty_bool, empty.str.isalnum()) + tm.assert_series_equal(empty_bool, empty.str.isalpha()) + tm.assert_series_equal(empty_bool, empty.str.isdigit()) + tm.assert_series_equal(empty_bool, empty.str.isspace()) + tm.assert_series_equal(empty_bool, empty.str.islower()) + tm.assert_series_equal(empty_bool, empty.str.isupper()) + tm.assert_series_equal(empty_bool, empty.str.istitle()) + tm.assert_series_equal(empty_bool, empty.str.isnumeric()) + tm.assert_series_equal(empty_bool, 
empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) From b9b462c01819e704aced371b64c6b5886f871b45 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 16:20:37 -0800 Subject: [PATCH 096/185] CLN: parts of #29667 (#29677) --- pandas/core/computation/eval.py | 4 ++-- pandas/core/computation/ops.py | 4 +++- pandas/core/computation/pytables.py | 7 ++++--- pandas/core/computation/scope.py | 10 +++++----- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index de2133f64291d..72f2e1d8e23e5 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -11,7 +11,7 @@ from pandas.core.computation.engines import _engines from pandas.core.computation.expr import Expr, _parsers, tokenize_string -from pandas.core.computation.scope import _ensure_scope +from pandas.core.computation.scope import ensure_scope from pandas.io.formats.printing import pprint_thing @@ -309,7 +309,7 @@ def eval( _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope - env = _ensure_scope( + env = ensure_scope( level + 1, global_dict=global_dict, local_dict=local_dict, diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index ce67c3152ecd0..524013ceef5ff 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -197,7 +197,9 @@ class Op: Hold an operator of arbitrary arity. """ - def __init__(self, op, operands, *args, **kwargs): + op: str + + def __init__(self, op: str, operands, *args, **kwargs): self.op = _bool_op_map.get(op, op) self.operands = operands self.encoding = kwargs.get("encoding", None) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 4eb39898214c5..ff7e713b3e71a 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -13,7 +13,7 @@ import pandas as pd import pandas.core.common as com -from pandas.core.computation import expr, ops +from pandas.core.computation import expr, ops, scope as _scope from pandas.core.computation.common import _ensure_decoded from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.ops import UndefinedVariableError, is_term @@ -21,10 +21,10 @@ from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -class Scope(expr.Scope): +class Scope(_scope.Scope): __slots__ = ("queryables",) - def __init__(self, level, global_dict=None, local_dict=None, queryables=None): + def __init__(self, level: int, global_dict=None, local_dict=None, queryables=None): super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) self.queryables = queryables or dict() @@ -40,6 +40,7 @@ def __init__(self, name, env, side=None, encoding=None): def _resolve_name(self): # must be a queryables if self.side == "left": + # Note: The behavior of __new__ ensures that self.name is a str here if self.name not in self.env.queryables: raise NameError("name {name!r} is not defined".format(name=self.name)) return self.name diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index ee82664f6cb21..2c5c687a44680 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -16,9 +16,9 @@ from pandas.compat.chainmap import DeepChainMap -def _ensure_scope( - level, global_dict=None, local_dict=None, 
resolvers=(), target=None, **kwargs -): +def ensure_scope( + level: int, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs +) -> "Scope": """Ensure that we are grabbing the correct scope.""" return Scope( level + 1, @@ -119,7 +119,7 @@ def __init__( self.scope.update(local_dict.scope) if local_dict.target is not None: self.target = local_dict.target - self.update(local_dict.level) + self._update(local_dict.level) frame = sys._getframe(self.level) @@ -251,7 +251,7 @@ def _get_vars(self, stack, scopes): # scope after the loop del frame - def update(self, level: int): + def _update(self, level: int): """ Update the current scope by going back `level` levels. From e1cadfa289c765978b812fc39e6af3e8d209d172 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 16:27:06 -0800 Subject: [PATCH 097/185] REF: make _aggregate_series_pure_python extraction behave like the cython version (#29641) --- pandas/core/groupby/groupby.py | 25 ++++++-------------- pandas/core/groupby/ops.py | 14 ++++++++--- pandas/tests/groupby/aggregate/test_other.py | 25 ++++++++++++++++++++ 3 files changed, 43 insertions(+), 21 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3199f166d5b3f..236df4b3854a4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -31,7 +31,6 @@ class providing the base-class of operations. from pandas.core.dtypes.common import ( ensure_float, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, @@ -45,7 +44,6 @@ class providing the base-class of operations. from pandas.core.arrays import Categorical, try_cast_to_ea from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.construction import extract_array from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, ops @@ -790,22 +788,11 @@ def _try_cast(self, result, obj, numeric_only: bool = False): dtype = obj.dtype if not is_scalar(result): - if is_datetime64tz_dtype(dtype): - # GH 23683 - # Prior results _may_ have been generated in UTC. - # Ensure we localize to UTC first before converting - # to the target timezone - arr = extract_array(obj) - try: - result = arr._from_sequence(result, dtype="datetime64[ns, UTC]") - result = result.astype(dtype) - except TypeError: - # _try_cast was called at a point where the result - # was already tz-aware - pass - elif is_extension_array_dtype(dtype): + if is_extension_array_dtype(dtype) and dtype.kind != "M": # The function can return something of any type, so check - # if the type is compatible with the calling EA. + # if the type is compatible with the calling EA. + # datetime64tz is handled correctly in agg_series, + # so is excluded here. 
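
(For context on the exclusion above: a user-defined aggregation over a
tz-aware column need not return datetime-like values at all, and the removed
branch attempted to cast such results back to the tz-aware dtype, guarded only
by a try/except. A rough sketch of the post-fix behavior, mirroring the test
added later in this same patch:

    import pandas as pd

    dti = pd.date_range("2012-01-01", periods=4, tz="UTC")
    df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti})
    gb = df.groupby("a")

    gb["b"].agg(lambda x: x.iloc[0])       # dtype-preserving: stays tz-aware
    gb["b"].agg(lambda x: x.iloc[0].year)  # not dtype-preserving: plain int64

The tz-aware path is instead handled in agg_series via the pure-python
aggregation, as the ops.py hunk below shows.)
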
             # return the same type (Series) as our caller
             cls = dtype.construct_array_type()
@@ -872,7 +859,9 @@ def _cython_agg_general(
             if numeric_only and not is_numeric:
                 continue
 
-            result, names = self.grouper.aggregate(obj.values, how, min_count=min_count)
+            result, names = self.grouper.aggregate(
+                obj._values, how, min_count=min_count
+            )
             output[name] = self._try_cast(result, obj)
 
         if len(output) == 0:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7ed79e4b00371..47ca2b2190ecf 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -604,11 +604,11 @@ def agg_series(self, obj: Series, func):
             # SeriesGrouper would raise if we were to call _aggregate_series_fast
             return self._aggregate_series_pure_python(obj, func)
 
-        elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
+        elif is_extension_array_dtype(obj.dtype):
             # _aggregate_series_fast would raise TypeError when
             #  calling libreduction.Slider
+            #  In the datetime64tz case it would incorrectly cast to tz-naive
             # TODO: can we get a performant workaround for EAs backed by ndarray?
-            # TODO: is the datetime64tz case supposed to go through here?
             return self._aggregate_series_pure_python(obj, func)
 
         elif isinstance(obj.index, MultiIndex):
@@ -657,7 +657,15 @@ def _aggregate_series_pure_python(self, obj: Series, func):
             res = func(group)
             if result is None:
                 if isinstance(res, (Series, Index, np.ndarray)):
-                    raise ValueError("Function does not reduce")
+                    if len(res) == 1:
+                        # e.g. test_agg_lambda_with_timezone lambda e: e.head(1)
+                        # FIXME: are we potentially losing important res.index info?
+
+                        # TODO: use `.item()` if/when we un-deprecate it.
+                        # For non-Series we could just do `res[0]`
+                        res = next(iter(res))
+                    else:
+                        raise ValueError("Function does not reduce")
                 result = np.empty(ngroups, dtype="O")
 
             counts[label] = group.shape[0]
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 1c297f3e2ada3..721045f1097f8 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -454,6 +454,31 @@ def test_agg_over_numpy_arrays():
     tm.assert_frame_equal(result, expected)
 
 
+def test_agg_tzaware_non_datetime_result():
+    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
+    # with function that is not dtype-preserving
+    dti = pd.date_range("2012-01-01", periods=4, tz="UTC")
+    df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti})
+    gb = df.groupby("a")
+
+    # Case that _does_ preserve the dtype
+    result = gb["b"].agg(lambda x: x.iloc[0])
+    expected = pd.Series(dti[::2], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+    # Cases that do _not_ preserve the dtype
+    result = gb["b"].agg(lambda x: x.iloc[0].year)
+    expected = pd.Series([2012, 2012], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
+    expected = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+
 def test_agg_timezone_round_trip():
     # GH 15426
     ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
From cd5670850f18e81317ce78f2c1e69aad10f56809 Mon Sep 17 00:00:00 2001
From: Paul Siegel
Date: Sun, 17 Nov 2019 19:33:34 -0500
Subject: [PATCH 098/185] BUG: `RollingGroupBy.quantile` ignores `interpolation` keyword argument (#29567)

---
 doc/source/whatsnew/v1.0.0.rst | 1 +
 pandas/core/window/rolling.py | 4 +++-
pandas/tests/window/test_grouper.py | 25 ++++++++++++++++++++----- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 39f33c3bb8cb7..cb68bd0e762c4 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -437,6 +437,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) +- Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index fd221c53e244c..bec350f6b7d8b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1499,7 +1499,9 @@ def f(arg, *args, **kwargs): interpolation, ) - return self._apply(f, "quantile", quantile=quantile, **kwargs) + return self._apply( + f, "quantile", quantile=quantile, interpolation=interpolation, **kwargs + ) _shared_docs[ "cov" diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index c278897e1d395..189942bc07d2a 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -60,7 +60,6 @@ def test_rolling(self): r = g.rolling(window=4) for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: - result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) @@ -70,8 +69,16 @@ def test_rolling(self): expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) tm.assert_frame_equal(result, expected) - result = r.quantile(0.5) - expected = g.apply(lambda x: x.rolling(4).quantile(0.5)) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] + ) + def test_rolling_quantile(self, interpolation): + g = self.frame.groupby("A") + r = g.rolling(window=4) + result = r.quantile(0.4, interpolation=interpolation) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) tm.assert_frame_equal(result, expected) def test_rolling_corr_cov(self): @@ -142,8 +149,16 @@ def test_expanding(self): expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) tm.assert_frame_equal(result, expected) - result = r.quantile(0.5) - expected = g.apply(lambda x: x.expanding().quantile(0.5)) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] + ) + def test_expanding_quantile(self, interpolation): + g = self.frame.groupby("A") + r = g.expanding() + result = r.quantile(0.4, interpolation=interpolation) + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) tm.assert_frame_equal(result, expected) def test_expanding_corr_cov(self): From 698522f0b70e56d68ade0e4102a3675e52b28ebd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 16:34:44 -0800 Subject: [PATCH 099/185] TYP: add string annotations in io.pytables (#29682) --- pandas/io/pytables.py | 76 ++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f41c767d0b13a..193b8f5053d65 100644 --- a/pandas/io/pytables.py +++ 
b/pandas/io/pytables.py @@ -9,7 +9,7 @@ import os import re import time -from typing import List, Optional, Type, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union import warnings import numpy as np @@ -55,6 +55,10 @@ from pandas.io.common import _stringify_path from pandas.io.formats.printing import adjoin, pprint_thing +if TYPE_CHECKING: + from tables import File # noqa:F401 + + # versioning attribute _version = "0.15.2" @@ -465,6 +469,8 @@ class HDFStore: >>> store.close() """ + _handle: Optional["File"] + def __init__( self, path, @@ -535,7 +541,7 @@ def __getattr__(self, name): ) ) - def __contains__(self, key): + def __contains__(self, key: str): """ check for existence of this key can match the exact pathname or the pathnm w/o the leading '/' """ @@ -560,7 +566,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.close() - def keys(self): + def keys(self) -> List[str]: """ Return a list of keys corresponding to objects stored in HDFStore. @@ -698,13 +704,13 @@ def flush(self, fsync: bool = False): except OSError: pass - def get(self, key): + def get(self, key: str): """ Retrieve pandas object stored in file. Parameters ---------- - key : object + key : str Returns ------- @@ -718,7 +724,7 @@ def get(self, key): def select( self, - key, + key: str, where=None, start=None, stop=None, @@ -733,7 +739,7 @@ def select( Parameters ---------- - key : object + key : str Object being retrieved from file. where : list, default None List of Term (or convertible) objects, optional. @@ -784,13 +790,15 @@ def func(_start, _stop, _where): return it.get_result() - def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates( + self, key: str, where=None, start=None, stop=None, **kwargs + ): """ return the selection as an Index Parameters ---------- - key : object + key : str where : list of Term (or convertible) objects, optional start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection @@ -800,15 +808,16 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs where=where, start=start, stop=stop, **kwargs ) - def select_column(self, key, column, **kwargs): + def select_column(self, key: str, column: str, **kwargs): """ return a single column from the table. This is generally only useful to select an indexable Parameters ---------- - key : object - column: the column of interest + key : str + column: str + The column of interest. Raises ------ @@ -966,7 +975,7 @@ def put(self, key, value, format=None, append=False, **kwargs): kwargs = self._validate_format(format, kwargs) self._write_to_group(key, value, append=append, **kwargs) - def remove(self, key, where=None, start=None, stop=None): + def remove(self, key: str, where=None, start=None, stop=None): """ Remove pandas object partially by specifying the where condition @@ -1152,16 +1161,17 @@ def append_to_multiple( self.append(k, val, data_columns=dc, **kwargs) - def create_table_index(self, key, **kwargs): - """ Create a pytables index on the table + def create_table_index(self, key: str, **kwargs): + """ + Create a pytables index on the table. 
+ Parameters ---------- - key : object (the node to index) + key : str Raises ------ - raises if the node is not a table - + TypeError: raises if the node is not a table """ # version requirements @@ -1247,17 +1257,19 @@ def walk(self, where="/"): yield (g._v_pathname.rstrip("/"), groups, leaves) - def get_node(self, key): + def get_node(self, key: str): """ return the node with the key or None if it does not exist """ self._check_if_open() + if not key.startswith("/"): + key = "/" + key + + assert self._handle is not None try: - if not key.startswith("/"): - key = "/" + key return self._handle.get_node(self.root, key) - except _table_mod.exceptions.NoSuchNodeError: + except _table_mod.exceptions.NoSuchNodeError: # type: ignore return None - def get_storer(self, key): + def get_storer(self, key: str): """ return the storer object for a key, raise if not in the file """ group = self.get_node(key) if group is None: @@ -1481,7 +1493,7 @@ def error(t): def _write_to_group( self, - key, + key: str, value, format, index=True, @@ -1492,6 +1504,10 @@ def _write_to_group( ): group = self.get_node(key) + # we make this assertion for mypy; the get_node call will already + # have raised if this is incorrect + assert self._handle is not None + # remove the node if we are not appending if group is not None and not append: self._handle.remove_node(group, recursive=True) @@ -2691,7 +2707,7 @@ def f(values, freq=None, tz=None): return klass - def validate_read(self, kwargs): + def validate_read(self, kwargs: Dict[str, Any]) -> Dict[str, Any]: """ remove table keywords from kwargs and return raise if any keywords are passed which are not-None @@ -2733,7 +2749,7 @@ def get_attrs(self): def write(self, obj, **kwargs): self.set_attrs() - def read_array(self, key, start=None, stop=None): + def read_array(self, key: str, start=None, stop=None): """ read an array for the specified node (off of group """ import tables @@ -4008,7 +4024,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return Index(coords) - def read_column(self, column, where=None, start=None, stop=None): + def read_column(self, column: str, where=None, start=None, stop=None): """return a single column from the table, generally only indexables are interesting """ @@ -4642,8 +4658,8 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): converted, "datetime64", _tables().Int64Col(), - freq=getattr(index, "freq", None), - tz=getattr(index, "tz", None), + freq=index.freq, + tz=index.tz, index_name=index_name, ) elif isinstance(index, TimedeltaIndex): @@ -4652,7 +4668,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): converted, "timedelta64", _tables().Int64Col(), - freq=getattr(index, "freq", None), + freq=index.freq, index_name=index_name, ) elif isinstance(index, (Int64Index, PeriodIndex)): From 4bdce988bc9c82f72b06365924f16f97043d3eb5 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 18 Nov 2019 00:37:04 +0000 Subject: [PATCH 100/185] CI: GitHub action for checks (linting, docstrings...) 
(#29546) --- .github/workflows/activate.yml | 21 ------- .github/workflows/ci.yml | 103 +++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 21 deletions(-) delete mode 100644 .github/workflows/activate.yml create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/activate.yml b/.github/workflows/activate.yml deleted file mode 100644 index 83838ea3d3941..0000000000000 --- a/.github/workflows/activate.yml +++ /dev/null @@ -1,21 +0,0 @@ -# Simple first task to activate GitHub actions. -# This won't run until it is merged, but future actions will -# run on PRs, so we can see we don't break things in more -# complex actions added later, like real builds. -# -# TODO: Remove this once another action exists -name: Activate - -on: - push: - branches: master - pull_request: - branches: master - -jobs: - activate: - name: Activate actions - runs-on: ubuntu-latest - steps: - - name: Activate - run: echo "GitHub actions ok" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000..5aa31e0ed3ab0 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,103 @@ +name: CI + +on: + push: + branches: master + pull_request: + branches: master + +env: + ENV_FILE: environment.yml + # TODO: remove export PATH=... in each step once this works + # PATH: $HOME/miniconda3/bin:$PATH + +jobs: + checks: + name: Checks + runs-on: ubuntu-latest + steps: + + - name: Checkout + uses: actions/checkout@v1 + + - name: Looking for unwanted patterns + run: ci/code_checks.sh patterns + if: true + + - name: Setup environment and build pandas + run: | + export PATH=$HOME/miniconda3/bin:$PATH + ci/setup_env.sh + if: true + + - name: Linting + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh lint + if: true + + - name: Dependencies consistency + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh dependencies + if: true + + - name: Checks on imported code + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh code + if: true + + - name: Running doctests + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh doctests + if: true + + - name: Docstring validation + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh docstrings + if: true + + - name: Typing validation + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh typing + if: true + + - name: Testing docstring validation script + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + pytest --capture=no --strict scripts + if: true + + - name: Running benchmarks + run: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + cd asv_bench + asv check -E existing + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream + if git diff upstream/master --name-only | grep -q "^asv_bench/"; then + asv machine --yes + ASV_OUTPUT="$(asv dev)" + if [[ $(echo "$ASV_OUTPUT" | grep "failed") ]]; then + echo "##vso[task.logissue type=error]Benchmarks run with errors" + echo "$ASV_OUTPUT" + exit 1 + else + echo "Benchmarks run without errors" + fi + else + echo "Benchmarks did not run, no changes detected" + fi + if: true From c23649143781c658f792e8f7a5b4368ed01f719c Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Mon, 18 Nov 2019 00:39:28 +0000 Subject: [PATCH 101/185] 
CLN: String formatting % -> f-strings (#29518) --- pandas/core/arrays/period.py | 2 +- pandas/core/common.py | 2 +- pandas/core/frame.py | 15 ++++++------ pandas/core/generic.py | 28 +++++++++++------------ pandas/core/indexes/base.py | 24 +++++++++---------- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimelike.py | 11 +++------ pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/frozen.py | 4 ++-- pandas/core/indexes/multi.py | 19 ++++++++------- pandas/core/series.py | 11 ++++----- pandas/core/util/hashing.py | 2 +- pandas/io/clipboard/__init__.py | 6 ++--- pandas/io/sas/sas_xport.py | 2 +- pandas/io/stata.py | 2 +- pandas/plotting/_matplotlib/converter.py | 3 ++- pandas/tests/frame/test_query_eval.py | 2 +- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/io/formats/test_style.py | 2 +- pandas/tests/util/test_validate_kwargs.py | 6 ++--- pandas/util/_test_decorators.py | 2 +- 21 files changed, 70 insertions(+), 79 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index fdf4059fad569..f3d51b28ad399 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -596,7 +596,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): if date_format: formatter = lambda dt: dt.strftime(date_format) else: - formatter = lambda dt: "%s" % dt + formatter = lambda dt: str(dt) if self._hasnans: mask = self._isnan diff --git a/pandas/core/common.py b/pandas/core/common.py index 133e60de5d694..41b6ebbd2f196 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -451,7 +451,7 @@ def pipe(obj, func, *args, **kwargs): if isinstance(func, tuple): func, target = func if target in kwargs: - msg = "%s is both the pipe target and a keyword argument" % target + msg = f"{target} is both the pipe target and a keyword argument" raise ValueError(msg) kwargs[target] = obj return func(*args, **kwargs) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fae5a6b549af7..0b76566adf802 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1805,7 +1805,7 @@ def to_records( if isinstance(self.index, ABCMultiIndex): for i, n in enumerate(index_names): if n is None: - index_names[i] = "level_%d" % count + index_names[i] = f"level_{count}" count += 1 elif index_names[0] is None: index_names = ["index"] @@ -2454,7 +2454,7 @@ def info( exceeds_info_cols = len(self.columns) > max_cols def _verbose_repr(): - lines.append("Data columns (total %d columns):" % len(self.columns)) + lines.append(f"Data columns (total {len(self.columns)} columns):") space = max(len(pprint_thing(k)) for k in self.columns) + 4 counts = None @@ -2846,7 +2846,7 @@ def _getitem_bool_array(self, key): ) elif len(key) != len(self.index): raise ValueError( - "Item wrong length %d instead of %d." % (len(key), len(self.index)) + f"Item wrong length {len(key)} instead of {len(self.index)}." ) # check_bool_indexer will throw exception if Series key cannot @@ -2957,7 +2957,7 @@ def _setitem_array(self, key, value): if com.is_bool_indexer(key): if len(key) != len(self.index): raise ValueError( - "Item wrong length %d instead of %d!" % (len(key), len(self.index)) + f"Item wrong length {len(key)} instead of {len(self.index)}!" 
) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] @@ -4537,8 +4537,8 @@ def _maybe_casted_values(index, labels=None): if not drop: if isinstance(self.index, ABCMultiIndex): names = [ - n if n is not None else ("level_%d" % i) - for (i, n) in enumerate(self.index.names) + (n if n is not None else f"level_{i}") + for i, n in enumerate(self.index.names) ] to_insert = zip(self.index.levels, self.index.codes) else: @@ -4858,8 +4858,7 @@ def sort_values( by = [by] if is_sequence(ascending) and len(by) != len(ascending): raise ValueError( - "Length of ascending (%d) != length of by (%d)" - % (len(ascending), len(by)) + f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) if len(by) > 1: from pandas.core.sorting import lexsort_indexer diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d76c870d6227e..982a57a6f725e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -421,8 +421,7 @@ def _construct_axes_from_arguments( if a in kwargs: if alias in kwargs: raise TypeError( - "arguments are mutually exclusive " - "for [%s,%s]" % (a, alias) + f"arguments are mutually exclusive for [{a},{alias}]" ) continue if alias in kwargs: @@ -754,7 +753,7 @@ def transpose(self, *args, **kwargs): # we must have unique axes if len(axes) != len(set(axes)): - raise ValueError("Must specify %s unique axes" % self._AXIS_LEN) + raise ValueError(f"Must specify {self._AXIS_LEN} unique axes") new_axes = self._construct_axes_dict_from( self, [self._get_axis(x) for x in axes_names] @@ -2096,7 +2095,7 @@ def __repr__(self) -> str: # string representation based upon iterating over self # (since, by definition, `PandasContainers` are iterable) prepr = "[%s]" % ",".join(map(pprint_thing, self)) - return "%s(%s)" % (self.__class__.__name__, prepr) + return f"{self.__class__.__name__}({prepr})" def _repr_latex_(self): """ @@ -6357,7 +6356,7 @@ def fillna( elif isinstance(value, ABCDataFrame) and self.ndim == 2: new_data = self.where(self.notna(), value) else: - raise ValueError("invalid fill value with a %s" % type(value)) + raise ValueError(f"invalid fill value with a {type(value)}") if inplace: self._update_inplace(new_data) @@ -6794,9 +6793,8 @@ def replace( if is_list_like(value): if len(to_replace) != len(value): raise ValueError( - "Replacement lists must match " - "in length. Expecting %d got %d " - % (len(to_replace), len(value)) + f"Replacement lists must match in length. 
" + f"Expecting {len(to_replace)} got {len(value)} " ) new_data = self._data.replace_list( @@ -8871,7 +8869,7 @@ def align( fill_axis=fill_axis, ) else: # pragma: no cover - raise TypeError("unsupported type: %s" % type(other)) + raise TypeError(f"unsupported type: {type(other)}") def _align_frame( self, @@ -9515,9 +9513,9 @@ def tshift(self, periods=1, freq=None, axis=0): new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) else: - msg = "Given freq %s does not match PeriodIndex freq %s" % ( - freq.rule_code, - orig_freq.rule_code, + msg = ( + f"Given freq {freq.rule_code} does not match" + f" PeriodIndex freq {orig_freq.rule_code}" ) raise ValueError(msg) else: @@ -9665,7 +9663,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): if before is not None and after is not None: if before > after: - raise ValueError("Truncate: %s must be after %s" % (after, before)) + raise ValueError(f"Truncate: {after} must be after {before}") slicer = [slice(None, None)] * self._AXIS_LEN slicer[axis] = slice(before, after) @@ -9711,7 +9709,7 @@ def _tz_convert(ax, tz): if len(ax) > 0: ax_name = self._get_axis_name(axis) raise TypeError( - "%s is not a valid DatetimeIndex or PeriodIndex" % ax_name + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" ) else: ax = DatetimeIndex([], tz=tz) @@ -9875,7 +9873,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): if len(ax) > 0: ax_name = self._get_axis_name(axis) raise TypeError( - "%s is not a valid DatetimeIndex or PeriodIndex" % ax_name + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" ) else: ax = DatetimeIndex([], tz=tz) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2321e077df285..86664a14e91dd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -963,14 +963,14 @@ def __repr__(self): data = self._format_data() attrs = self._format_attrs() space = self._format_space() - - prepr = (",%s" % space).join("%s=%s" % (k, v) for k, v in attrs) + attrs_str = [f"{k}={v}" for k, v in attrs] + prepr = f",{space}".join(attrs_str) # no data provided, just attributes if data is None: data = "" - res = "%s(%s%s)" % (klass, data, prepr) + res = f"{klass}({data}{prepr})" return res @@ -1124,13 +1124,13 @@ def _summary(self, name=None): tail = self[-1] if hasattr(tail, "format") and not isinstance(tail, str): tail = tail.format() - index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail)) + index_summary = f", {head} to {tail}" else: index_summary = "" if name is None: name = type(self).__name__ - return "%s: %s entries%s" % (name, len(self), index_summary) + return f"{name}: {len(self)} entries{index_summary}" def summary(self, name=None): """ @@ -1304,7 +1304,7 @@ def _set_names(self, values, level=None): if not is_list_like(values): raise ValueError("Names must be a list-like") if len(values) != 1: - raise ValueError("Length of new names must be 1, got %d" % len(values)) + raise ValueError(f"Length of new names must be 1, got {len(values)}") # GH 20527 # All items in 'name' need to be hashable: @@ -1475,8 +1475,8 @@ def _validate_index_level(self, level): if isinstance(level, int): if level < 0 and level != -1: raise IndexError( - "Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level,) + f"Too many levels: Index has only 1 level," + f" {level} is not a valid level number" ) elif level > 0: raise IndexError( @@ -4540,7 +4540,7 @@ def shift(self, periods=1, freq=None): '2012-03-01'], dtype='datetime64[ns]', 
freq='MS') """ - raise NotImplementedError("Not supported for type %s" % type(self).__name__) + raise NotImplementedError(f"Not supported for type {type(self).__name__}") def argsort(self, *args, **kwargs): """ @@ -5047,8 +5047,8 @@ def get_slice_bound(self, label, side, kind): if side not in ("left", "right"): raise ValueError( - "Invalid value for side kwarg," - " must be either 'left' or 'right': %s" % (side,) + f"Invalid value for side kwarg, must be either" + f" 'left' or 'right': {side}" ) original_label = label @@ -5602,7 +5602,7 @@ def _trim_front(strings): def _validate_join_method(method): if method not in ["left", "right", "inner", "outer"]: - raise ValueError("do not recognize join method %s" % method) + raise ValueError(f"do not recognize join method {method}") def default_index(n): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 819f8ac53197a..e0ffc726bc3a1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -357,7 +357,7 @@ def _format_attrs(self): ] if self.name is not None: attrs.append(("name", ibase.default_pprint(self.name))) - attrs.append(("dtype", "'%s'" % self.dtype.name)) + attrs.append(("dtype", f"'{self.dtype.name}'")) max_seq_items = get_option("display.max_seq_items") or len(self) if len(self) > max_seq_items: attrs.append(("length", len(self))) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index ceb23f61ae15a..b8670b765ca90 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -36,7 +36,6 @@ from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.tools.timedeltas import to_timedelta -import pandas.io.formats.printing as printing from pandas.tseries.frequencies import to_offset _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -496,7 +495,7 @@ def _format_attrs(self): if attrib == "freq": freq = self.freqstr if freq is not None: - freq = "'%s'" % freq + freq = f"{freq!r}" attrs.append(("freq", freq)) return attrs @@ -686,17 +685,13 @@ def _summary(self, name=None): """ formatter = self._formatter_func if len(self) > 0: - index_summary = ", %s to %s" % (formatter(self[0]), formatter(self[-1])) + index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" else: index_summary = "" if name is None: name = type(self).__name__ - result = "%s: %s entries%s" % ( - printing.pprint_thing(name), - len(self), - index_summary, - ) + result = f"{name}: {len(self)} entries{index_summary}" if self.freq: result += "\nFreq: %s" % self.freqstr diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 41f5eb90d51b0..4a95f0a2ab7e9 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -491,7 +491,7 @@ def _formatter_func(self): from pandas.io.formats.format import _get_format_datetime64 formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: "'%s'" % formatter(x, tz=self.tz) + return lambda x: f"'{formatter(x, tz=self.tz)}'" # -------------------------------------------------------------------- # Set Operation Methods diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 08c86b81b59c0..1b33269d404d6 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -109,7 +109,7 @@ def __str__(self) -> str: return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) def __repr__(self) -> str: - return "%s(%s)" % (self.__class__.__name__, 
str(self)) + return f"{self.__class__.__name__}({str(self)})" __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled pop = append = extend = remove = sort = insert = _disabled @@ -153,7 +153,7 @@ def __repr__(self) -> str: Return a string representation for this object. """ prepr = pprint_thing(self, escape_chars=("\t", "\r", "\n"), quote_strings=True) - return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + return f"{type(self).__name__}({prepr}, dtype='{self.dtype}')" @deprecate_kwarg(old_arg_name="v", new_arg_name="value") def searchsorted(self, value, side="left", sorter=None): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f3a735511c96b..86398613798be 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1329,21 +1329,20 @@ def _get_level_number(self, level) -> int: count = self.names.count(level) if (count > 1) and not is_integer(level): raise ValueError( - "The name %s occurs multiple times, use a level number" % level + f"The name {level} occurs multiple times, use a level number" ) try: level = self.names.index(level) except ValueError: if not is_integer(level): - raise KeyError("Level %s not found" % str(level)) + raise KeyError(f"Level {level} not found") elif level < 0: level += self.nlevels if level < 0: orig_level = level - self.nlevels raise IndexError( - "Too many levels: Index has only %d " - "levels, %d is not a valid level number" - % (self.nlevels, orig_level) + f"Too many levels: Index has only {self.nlevels} levels," + f" {orig_level} is not a valid level number" ) # Note: levels are zero-based elif level >= self.nlevels: @@ -2286,8 +2285,8 @@ def reorder_levels(self, order): order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: raise AssertionError( - "Length of order must be same as " - "number of levels (%d), got %d" % (self.nlevels, len(order)) + f"Length of order must be same as number of levels ({self.nlevels})," + f" got {len(order)}" ) new_levels = [self.levels[i] for i in order] new_codes = [self.codes[i] for i in order] @@ -2599,8 +2598,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): def _partial_tup_index(self, tup, side="left"): if len(tup) > self.lexsort_depth: raise UnsortedIndexError( - "Key length (%d) was greater than MultiIndex" - " lexsort depth (%d)" % (len(tup), self.lexsort_depth) + f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth" + f" ({self.lexsort_depth})" ) n = len(tup) @@ -2611,7 +2610,7 @@ def _partial_tup_index(self, tup, side="left"): if lab not in lev: if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): - raise TypeError("Level type mismatch: %s" % lab) + raise TypeError(f"Level type mismatch: {lab}") # short circuit loc = lev.searchsorted(lab, side=side) diff --git a/pandas/core/series.py b/pandas/core/series.py index d771aefb55844..3f69dd53491c1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1008,7 +1008,7 @@ def _unpickle_series_compat(self, state): self.name = name else: - raise Exception("cannot unpickle legacy formats -> [%s]" % state) + raise Exception(f"cannot unpickle legacy formats -> [{state}]") # indexers @property @@ -1303,7 +1303,7 @@ def _set_labels(self, key, value): indexer = self.index.get_indexer(key) mask = indexer == -1 if mask.any(): - raise ValueError("%s not contained in the index" % str(key[mask])) + raise ValueError(f"{key[mask]} not contained in the index") self._set_values(indexer, value) def _set_values(self, key, 
value): @@ -2591,7 +2591,7 @@ def dot(self, other): rvals = np.asarray(other) if lvals.shape[0] != rvals.shape[0]: raise Exception( - "Dot product shape mismatch, %s vs %s" % (lvals.shape, rvals.shape) + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" ) if isinstance(other, ABCDataFrame): @@ -2603,7 +2603,7 @@ def dot(self, other): elif isinstance(rvals, np.ndarray): return np.dot(lvals, rvals) else: # pragma: no cover - raise TypeError("unsupported type: %s" % type(other)) + raise TypeError(f"unsupported type: {type(other)}") def __matmul__(self, other): """ @@ -3083,8 +3083,7 @@ def _try_kind_sort(arr): if is_list_like(ascending): if len(ascending) != 1: raise ValueError( - "Length of ascending (%d) must be 1 " - "for Series" % (len(ascending)) + f"Length of ascending ({len(ascending)}) must be 1 for Series" ) ascending = ascending[0] diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 23c370638b572..fa3582755a202 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -133,7 +133,7 @@ def hash_pandas_object( h = Series(h, index=obj.index, dtype="uint64", copy=False) else: - raise TypeError("Unexpected type for hashing %s" % type(obj)) + raise TypeError(f"Unexpected type for hashing {type(obj)}") return h diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 40452b41998df..4f690a57893d1 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -87,7 +87,7 @@ class PyperclipException(RuntimeError): class PyperclipWindowsException(PyperclipException): def __init__(self, message): - message += " (%s)" % ctypes.WinError() + message += f" ({ctypes.WinError()})" super().__init__(message) @@ -599,9 +599,9 @@ def set_clipboard(clipboard): } if clipboard not in clipboard_types: + allowed_clipboard_types = [repr(_) for _ in clipboard_types.keys()] raise ValueError( - "Argument must be one of %s" - % (", ".join([repr(_) for _ in clipboard_types.keys()])) + f"Argument must be one of {', '.join(allowed_clipboard_types)}" ) # Sets pyperclip's copy() and paste() functions: diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 2f2dbdbc76215..9aa8ed1dfeb5d 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -482,7 +482,7 @@ def read(self, nrows=None): df = pd.DataFrame(index=range(read_lines)) for j, x in enumerate(self.columns): - vec = data["s%d" % j] + vec = data["s" + str(j)] ntype = self.fields[j]["ntype"] if ntype == "numeric": vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"]) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d970f2819c3c1..24539057a5db9 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2640,7 +2640,7 @@ def _dtype_to_stata_type_117(dtype, column, force_strl): elif dtype == np.int8: return 65530 else: # pragma : no cover - raise NotImplementedError("Data type %s not supported." 
% dtype) + raise NotImplementedError(f"Data type {dtype} not supported.") def _pad_bytes_new(name, length): diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 946ce8bcec97f..4b0ba2bd423df 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -429,7 +429,8 @@ def __call__(self): ).format(estimate=estimate, dmin=dmin, dmax=dmax, arg=self.MAXTICKS * 2) ) - freq = "%dL" % self._get_interval() + interval = self._get_interval() + freq = f"{interval}L" tz = self.tz.tzname(None) st = _from_ordinal(dates.date2num(dmin)) # strip tz ed = _from_ordinal(dates.date2num(dmax)) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 04d27f4c12c59..cd1bee356ed8e 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -991,7 +991,7 @@ def test_query_lex_compare_strings(self, parser, engine): ops = {"<": operator.lt, ">": operator.gt, "<=": operator.le, ">=": operator.ge} for op, func in ops.items(): - res = df.query('X %s "d"' % op, engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index fa7a98c617677..dbd84f15d143c 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -134,7 +134,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): arr = values.sp_values.view() arr += num_offset - 1 else: - raise ValueError('Unsupported typestr: "%s"' % typestr) + raise ValueError(f'Unsupported typestr: "{typestr}"') return make_block(values, placement=placement, ndim=len(shape)) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index e5af74bdd4d33..61a3934187bd3 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -390,7 +390,7 @@ def test_applymap_subset_multiindex_code(self): def color_negative_red(val): color = "red" if val < 0 else "black" - return "color: %s" % color + return f"color: {color}" df.loc[pct_subset] df.style.applymap(color_negative_red, subset=pct_subset) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index ec9f3948403de..b6241def4e5d6 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -60,9 +60,9 @@ def test_validation(): @pytest.mark.parametrize("name", ["inplace", "copy"]) @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_kwarg_fail(name, value): - msg = 'For argument "%s" expected type bool, received type %s' % ( - name, - type(value).__name__, + msg = ( + f'For argument "{name}" expected type bool,' + f" received type {type(value).__name__}" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index b516c3d78a11e..b9c165140aaad 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -199,7 +199,7 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: def skip_if_np_lt(ver_str, reason=None, *args, **kwds): if reason is None: - reason = "NumPy %s or greater required" % ver_str + reason = f"NumPy {ver_str} or greater required" return pytest.mark.skipif( _np_version < LooseVersion(ver_str), 
reason=reason, *args, **kwds ) From 527143bda5586bd6a95ad7468b7cd42883a56a6e Mon Sep 17 00:00:00 2001 From: dalgarno <32097481+dalgarno@users.noreply.github.com> Date: Mon, 18 Nov 2019 01:40:47 +0000 Subject: [PATCH 102/185] TST: Add docstrings to arithmetic fixtures (#29441) --- pandas/tests/arithmetic/conftest.py | 36 ++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 774ff14398bdb..1f8fdfd671856 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -21,7 +21,24 @@ def id_func(x): @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): - # zero-dim integer array behaves like an integer + """ + Several variants of integer value 1. The zero-dim integer array + behaves like an integer. + + This fixture can be used to check that datetimelike indexes handle + addition and subtraction of integers and zero-dimensional arrays + of integers. + + Examples + -------- + >>> dti = pd.date_range('2016-01-01', periods=2, freq='H') + >>> dti + DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'], + dtype='datetime64[ns]', freq='H') + >>> dti + one + DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'], + dtype='datetime64[ns]', freq='H') + """ return request.param @@ -40,8 +57,21 @@ def one(request): @pytest.fixture(params=zeros) def zero(request): - # For testing division by (or of) zero for Index with length 5, this - # gives several scalar-zeros and length-5 vector-zeros + """ + Several types of scalar zeros and length 5 vectors of zeros. + + This fixture can be used to check that numeric-dtype indexes handle + division by any zero numeric-dtype. + + Uses vector of length 5 for broadcasting with `numeric_idx` fixture, + which creates numeric-dtype vectors also of length 5. + + Examples + -------- + >>> arr = pd.RangeIndex(5) + >>> arr / zeros + Float64Index([nan, inf, inf, inf, inf], dtype='float64') + """ return request.param From 4988c3fe6c18c1e086a69712203d9fece8302d73 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 18 Nov 2019 04:41:15 +0000 Subject: [PATCH 103/185] CI: Fixing error in code checks in GitHub actions (#29683) --- ci/code_checks.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d5566c522ac64..edd8fcd418c47 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -190,9 +190,9 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.rst" ".. ipython ::" doc/source RET=$(($RET + $?)) ; echo $MSG "DONE" -    MSG='Check for extra blank lines after the class definition' ; echo $MSG -    invgrep -R --include="*.py" --include="*.pyx" -E 'class.*:\n\n( )+"""' . -    RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for extra blank lines after the class definition' ; echo $MSG + invgrep -R --include="*.py" --include="*.pyx" -E 'class.*:\n\n( )+"""' . 
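+    # editorial note (not part of the upstream change): the removed lines in
+    # this hunk are identical to the re-added ones except for their leading
+    # whitespace, which used non-ASCII space characters that bash cannot
+    # parse. The invgrep pattern flags a class body that opens with a blank
+    # line before its docstring, e.g.:
+    #     class Example:
+    #
+    #         """flagged: blank line between class line and docstring"""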
+ RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG set -o pipefail From c91683904b42aae92cee61e63d580f25017accb6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 18 Nov 2019 08:55:21 +0100 Subject: [PATCH 104/185] REGR: fix DataFrame.agg case with list-like return value (#29632) --- pandas/core/base.py | 4 ++-- pandas/tests/frame/test_apply.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 1c74c977e39bc..e070005c56d7a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -602,9 +602,9 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): if not len(results): raise ValueError("no results") - if all(np.ndim(x) > 0 for x in results): + try: return concat(results, keys=keys, axis=1, sort=False) - else: + except TypeError: # we are concatting non-NDFrame objects, # e.g. a list of scalars diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fea50b3b7f75d..ad53fcf29c57d 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1259,6 +1259,23 @@ def test_non_callable_aggregates(self): assert result == expected + def test_agg_listlike_result(self): + # GH-29587 user defined function returning list-likes + df = DataFrame( + {"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]} + ) + + def func(group_col): + return list(group_col.dropna().unique()) + + result = df.agg(func) + expected = pd.Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + result = df.agg([func]) + expected = expected.to_frame("func").T + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "df, func, expected", chain( From b6d64d2c01c93745fc56baa3c05987f60c8dbfe7 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 18 Nov 2019 05:37:17 -0800 Subject: [PATCH 105/185] Extension Module Compat Cleanup (#29666) --- pandas/_libs/src/compat_helper.h | 5 ----- pandas/_libs/src/parser/io.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/_libs/src/compat_helper.h b/pandas/_libs/src/compat_helper.h index 078069fb48af2..01d5b843d1bb6 100644 --- a/pandas/_libs/src/compat_helper.h +++ b/pandas/_libs/src/compat_helper.h @@ -38,13 +38,8 @@ PANDAS_INLINE int slice_get_indices(PyObject *s, Py_ssize_t *stop, Py_ssize_t *step, Py_ssize_t *slicelength) { -#if PY_VERSION_HEX >= 0x03000000 return PySlice_GetIndicesEx(s, length, start, stop, step, slicelength); -#else - return PySlice_GetIndicesEx((PySliceObject *)s, length, start, - stop, step, slicelength); -#endif // PY_VERSION_HEX } #endif // PANDAS__LIBS_SRC_COMPAT_HELPER_H_ diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 5d73230f32955..aecd4e03664e6 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -17,7 +17,7 @@ The full license is in the LICENSE file, distributed with this software. 
#define O_BINARY 0 #endif // O_BINARY -#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32) +#ifdef _WIN32 #define USE_WIN_UTF16 #include #endif From 17fe9a467581ca39f44c89876ebd0d38b9ca77ea Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 18 Nov 2019 13:44:04 +0000 Subject: [PATCH 106/185] =?UTF-8?q?[BUG]=20Validate=20dtype=20when=20Int64?= =?UTF-8?q?Index,=20UInt64Index,=20or=20Float64Index=20are=20cons=E2=80=A6?= =?UTF-8?q?=20(#29545)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/indexes/base.py | 2 +- pandas/core/indexes/numeric.py | 21 +++++++++++++++++++- pandas/core/indexes/range.py | 7 ------- pandas/tests/indexes/test_numeric.py | 17 ++++++++++++++++ pandas/tests/indexes/test_range.py | 15 +++++++++++--- pandas/tests/series/indexing/test_numeric.py | 3 +-- 7 files changed, 52 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cb68bd0e762c4..30a828064f812 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -343,6 +343,7 @@ Numeric - Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) - Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`) - Improved error message when using `frac` > 1 and `replace` = False (:issue:`27451`) +- Bug in numeric indexes resulted in it being possible to instantiate an :class:`Int64Index`, :class:`UInt64Index`, or :class:`Float64Index` with an invalid dtype (e.g. datetime-like) (:issue:`29539`) - Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`) - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 86664a14e91dd..8978a09825ee9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -381,7 +381,7 @@ def __new__( pass # Return an actual float index. 
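+        # illustrative note (behavior taken from the tests added further down,
+        # not from this hunk): with validation moved into the numeric-index
+        # constructors, an incompatible dtype now raises up front, e.g.
+        #   pd.Int64Index([1, 2, 3], dtype="float64")
+        #   -> ValueError: Incorrect `dtype` passed: expected signed integer,
+        #      received float64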
- return Float64Index(data, copy=copy, dtype=dtype, name=name) + return Float64Index(data, copy=copy, name=name) elif inferred == "string": pass diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 074cce085fb3c..29f56259dac79 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -15,6 +15,8 @@ is_float_dtype, is_integer_dtype, is_scalar, + is_signed_integer_dtype, + is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, ) @@ -27,6 +29,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas._typing import Dtype from pandas.core import algorithms import pandas.core.common as com from pandas.core.indexes.base import Index, InvalidIndexError, _index_shared_docs @@ -45,7 +48,7 @@ class NumericIndex(Index): _is_numeric_dtype = True def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): - + cls._validate_dtype(dtype) if fastpath is not None: warnings.warn( "The 'fastpath' keyword is deprecated, and will be " @@ -80,6 +83,22 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): name = data.name return cls._simple_new(subarr, name=name) + @classmethod + def _validate_dtype(cls, dtype: Dtype) -> None: + if dtype is None: + return + validation_metadata = { + "int64index": (is_signed_integer_dtype, "signed integer"), + "uint64index": (is_unsigned_integer_dtype, "unsigned integer"), + "float64index": (is_float_dtype, "float"), + "rangeindex": (is_signed_integer_dtype, "signed integer"), + } + + validation_func, expected = validation_metadata[cls._typ] + if not validation_func(dtype): + msg = f"Incorrect `dtype` passed: expected {expected}, received {dtype}" + raise ValueError(msg) + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): assert kind in ["ix", "loc", "getitem", None] diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d200ff6a71264..6f677848b1c79 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -14,7 +14,6 @@ from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, - is_int64_dtype, is_integer, is_integer_dtype, is_list_like, @@ -165,12 +164,6 @@ def _simple_new(cls, values, name=None, dtype=None): # -------------------------------------------------------------------- - @staticmethod - def _validate_dtype(dtype): - """ require dtype to be None or int64 """ - if not (dtype is None or is_int64_dtype(dtype)): - raise TypeError("Invalid to pass a non-int64 dtype to RangeIndex") - @cache_readonly def _constructor(self): """ return the class to use for construction """ diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index e0f7b1d1ade3d..6ee1ce5c4f2ad 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -167,6 +167,23 @@ def test_constructor(self): result = Index(np.array([np.nan])) assert pd.isna(result.values).all() + @pytest.mark.parametrize( + "index, dtype", + [ + (pd.Int64Index, "float64"), + (pd.UInt64Index, "categorical"), + (pd.Float64Index, "datetime64"), + (pd.RangeIndex, "float64"), + ], + ) + def test_invalid_dtype(self, index, dtype): + # GH 29539 + with pytest.raises( + ValueError, + match=rf"Incorrect `dtype` passed: expected \w+(?: \w+)?, received {dtype}", + ): + index([1, 2, 3], dtype=dtype) + def test_constructor_invalid(self): # invalid diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 
fa64e1bacb2e5..b60d3126da1d5 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -110,7 +110,10 @@ def test_constructor_same(self): result = RangeIndex(index) tm.assert_index_equal(result, index, exact=True) - with pytest.raises(TypeError): + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): RangeIndex(index, dtype="float64") def test_constructor_range(self): @@ -140,7 +143,10 @@ def test_constructor_range(self): expected = RangeIndex(1, 5, 2) tm.assert_index_equal(result, expected, exact=True) - with pytest.raises(TypeError): + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): Index(range(1, 5, 2), dtype="float64") msg = r"^from_range\(\) got an unexpected keyword argument" with pytest.raises(TypeError, match=msg): @@ -178,7 +184,10 @@ def test_constructor_corner(self): RangeIndex(1.1, 10.2, 1.3) # invalid passed type - with pytest.raises(TypeError): + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): RangeIndex(1, 5, dtype="float64") @pytest.mark.parametrize( diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index bcddcf843df06..60b89c01cc22d 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -86,8 +86,7 @@ def test_get(): 1764.0, 1849.0, 1936.0, - ], - dtype="object", + ] ), ) From 545d1752987f8e325f5ad3d94f0143c453de28cc Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 18 Nov 2019 18:06:25 +0100 Subject: [PATCH 107/185] TST: Add regression test for Series index.map failing (#20990) (#29093) --- pandas/tests/indexes/datetimes/test_datetime.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 1776538a15fc2..4a38e3a146c0e 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -456,3 +456,15 @@ def test_to_frame_datetime_tz(self): result = idx.to_frame() expected = DataFrame(idx, index=idx) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "name"]) + def test_index_map(self, name): + # see GH20990 + count = 6 + index = pd.date_range("2018-01-01", periods=count, freq="M", name=name).map( + lambda x: (x.year, x.month) + ) + exp_index = pd.MultiIndex.from_product( + ((2018,), range(1, 7)), names=[name, name] + ) + tm.assert_index_equal(index, exp_index) From e246c3b05924ac1fe083565a765ce847fcad3d91 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 18 Nov 2019 11:44:46 -0800 Subject: [PATCH 108/185] TST: fix DecimalArray._reduce kludges (#29630) * TST: fix DecimalArray._reduce kludges --- pandas/core/groupby/generic.py | 7 ------- pandas/core/groupby/groupby.py | 3 --- pandas/tests/extension/decimal/array.py | 9 ++++++++- pandas/tests/extension/decimal/test_decimal.py | 2 +- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6376dbefcf435..06fd3c1eae006 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -885,13 +885,6 @@ def aggregate(self, func=None, *args, **kwargs): # raised directly by _aggregate_multiple_funcs raise result = self._aggregate_frame(func) - except NotImplementedError as err: - if "decimal 
does not support skipna=True" in str(err): - # FIXME: kludge for DecimalArray tests - pass - else: - raise - result = self._aggregate_frame(func) else: result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 236df4b3854a4..99a4942df4f7f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1342,9 +1342,6 @@ def f(self, **kwargs): # raised in _get_cython_function, in some cases can # be trimmed by implementing cython funcs for more dtypes pass - elif "decimal does not support skipna=True" in str(err): - # FIXME: kludge for test_decimal:test_in_numeric_groupby - pass else: raise diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 93816e3a8a613..f9ba4b7a8ba16 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -166,7 +166,14 @@ def _concat_same_type(cls, to_concat): def _reduce(self, name, skipna=True, **kwargs): if skipna: - raise NotImplementedError("decimal does not support skipna=True") + # If we don't have any NAs, we can ignore skipna + if self.isna().any(): + other = self[~self.isna()] + return other._reduce(name, **kwargs) + + if name == "sum" and len(self) == 0: + # GH#29630 avoid returning int 0 or np.bool_(False) on old numpy + return decimal.Decimal(0) try: op = getattr(self.data, name) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 86724d4d09819..ce819c13c4498 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -145,7 +145,7 @@ class TestMissing(BaseDecimal, base.BaseMissingTests): class Reduce: def check_reduce(self, s, op_name, skipna): - if skipna or op_name in ["median", "skew", "kurt"]: + if op_name in ["median", "skew", "kurt"]: with pytest.raises(NotImplementedError): getattr(s, op_name)(skipna=skipna) From e423bac23676948cbcd209012945ba7a20c4a524 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Tue, 19 Nov 2019 04:06:57 +0000 Subject: [PATCH 109/185] CI: Use bash for windows script on azure (#29674) --- ci/azure/windows.yml | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index dfa82819b9826..86807b4010988 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -11,10 +11,12 @@ jobs: py36_np15: ENV_FILE: ci/deps/azure-windows-36.yaml CONDA_PY: "36" + PATTERN: "not slow and not network" py37_np141: ENV_FILE: ci/deps/azure-windows-37.yaml CONDA_PY: "37" + PATTERN: "not slow and not network" steps: - powershell: | @@ -22,38 +24,32 @@ jobs: Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" displayName: 'Add conda to PATH' - script: conda update -q -n base conda - displayName: Update conda - - script: | - call activate + displayName: 'Update conda' + - bash: | conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml displayName: 'Create anaconda environment' - - script: | - call activate pandas-dev - call conda list + - bash: | + source activate pandas-dev + conda list ci\\incremental\\build.cmd displayName: 'Build' - - script: | - call activate pandas-dev - pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 -r sxX --strict --durations=10 %* + - bash: | + source activate pandas-dev + ci/run_tests.sh displayName: 'Test' - task: PublishTestResults@2 inputs: 
testResultsFiles: 'test-data.xml' testRunTitle: 'Windows-$(CONDA_PY)' - powershell: | - $junitXml = "test-data.xml" - $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' - if ($matches[1] -eq 0) - { + $(Get-Content "test-data.xml" | Out-String) -match 'failures="(.*?)"' + if ($matches[1] -eq 0) { Write-Host "No test failures in test-data" - } - else - { - # note that this will produce $LASTEXITCODE=1 - Write-Error "$($matches[1]) tests failed" + } else { + Write-Error "$($matches[1]) tests failed" # will produce $LASTEXITCODE=1 } displayName: 'Check for test failures' - - script: | + - bash: | source activate pandas-dev python ci/print_skipped.py displayName: 'Print skipped tests' From 0485115dbae12c6e8d15fcb951a79c6993be3af2 Mon Sep 17 00:00:00 2001 From: Hubert <39779339+HubertKl@users.noreply.github.com> Date: Tue, 19 Nov 2019 05:23:54 +0100 Subject: [PATCH 110/185] BUG: Issue #29128 Series.var not returning the correct result (#29353) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/arrays/integer.py | 2 +- pandas/tests/arrays/test_integer.py | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 30a828064f812..0027343a13b60 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -342,6 +342,7 @@ Numeric - :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth: `DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`) - Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) - Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`) +- Bug in :meth:`Series.var` not computing the right value with a nullable integer dtype series not passing through ddof argument (:issue:`29128`) - Improved error message when using `frac` > 1 and `replace` = False (:issue:`27451`) - Bug in numeric indexes resulted in it being possible to instantiate an :class:`Int64Index`, :class:`UInt64Index`, or :class:`Float64Index` with an invalid dtype (e.g. 
datetime-like) (:issue:`29539`) - Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e167e556b244a..af7755fb1373d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -647,7 +647,7 @@ def _reduce(self, name, skipna=True, **kwargs): data[mask] = self._na_value op = getattr(nanops, "nan" + name) - result = op(data, axis=0, skipna=skipna, mask=mask) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) # if we have a boolean op, don't coerce if name in ["any", "all"]: diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 793de66767cc3..025366e5b210b 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -829,6 +829,26 @@ def test_arrow_array(data): assert arr.equals(expected) +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + # TODO(jreback) - these need testing / are broken # shift From d134b476da04a9e0427cc2bea0694fefc3b054e4 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 18 Nov 2019 23:59:10 -0500 Subject: [PATCH 111/185] ENH: When using another plotting backend, minimize pre-processing (#28647) --- pandas/plotting/_core.py | 5 +++++ pandas/tests/plotting/test_backend.py | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 6c3d70271fc12..da1e06dccc65d 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -776,6 +776,11 @@ def __call__(self, *args, **kwargs): ) kind = self._kind_aliases.get(kind, kind) + + # when using another backend, get out of the way + if plot_backend.__name__ != "pandas.plotting._matplotlib": + return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs) + if kind not in self._all_kinds: raise ValueError("{} is not a valid plot kind".format(kind)) diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index c84b78c79e771..9025f8c361a82 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -94,3 +94,11 @@ def test_setting_backend_without_plot_raises(): def test_no_matplotlib_ok(): with pytest.raises(ImportError): pandas.plotting._core._get_plot_backend("matplotlib") + + +def test_extra_kinds_ok(monkeypatch, restore_backend): + # https://github.com/pandas-dev/pandas/pull/28647 + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + pandas.set_option("plotting.backend", "pandas_dummy_backend") + df = pandas.DataFrame({"A": [1, 2, 3]}) + df.plot(kind="not a real kind") From a39d9c8a5901be8bf151bb65f7f1dd6896a51acb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 19 Nov 2019 07:19:15 -0600 Subject: [PATCH 112/185] CI: Use conda for 3.8 build (#29696) --- .travis.yml | 8 +++----- ci/build38.sh | 19 ------------------- ci/deps/travis-38.yaml | 16 ++++++++++++++++ ci/setup_env.sh | 5 ----- 4 files changed, 19 insertions(+), 29 
deletions(-) delete mode 100644 ci/build38.sh create mode 100644 ci/deps/travis-38.yaml diff --git a/.travis.yml b/.travis.yml index 048736e4bf1d0..0acd386eea9ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,11 +30,9 @@ matrix: - python: 3.5 include: - - dist: bionic - # 18.04 - python: 3.8.0 + - dist: trusty env: - - JOB="3.8-dev" PATTERN="(not slow and not network)" + - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network)" - dist: trusty env: @@ -88,7 +86,7 @@ install: script: - echo "script start" - echo "$JOB" - - if [ "$JOB" != "3.8-dev" ]; then source activate pandas-dev; fi + - source activate pandas-dev - ci/run_tests.sh after_script: diff --git a/ci/build38.sh b/ci/build38.sh deleted file mode 100644 index 66eb5cad38475..0000000000000 --- a/ci/build38.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -e -# Special build for python3.8 until numpy puts its own wheels up - -sudo apt-get install build-essential gcc xvfb -pip install --no-deps -U pip wheel setuptools -pip install python-dateutil pytz cython pytest pytest-xdist hypothesis - -# Possible alternative for getting numpy: -pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy - -python setup.py build_ext -inplace -python -m pip install -v --no-build-isolation -e . - -python -c "import sys; print(sys.version_info)" -python -c "import pandas as pd" -python -c "import hypothesis" - -# TODO: Is there anything else in setup_env that we really want to do? -# ci/setup_env.sh diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml new file mode 100644 index 0000000000000..bd62ffa9248fe --- /dev/null +++ b/ci/deps/travis-38.yaml @@ -0,0 +1,16 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - python=3.8.* + - cython>=0.29.13 + - numpy + - python-dateutil + - nomkl + - pytz + # universal + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 + - hypothesis>=3.58.0 + - pip diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 0e8d6fb7cd35a..3d79c0cfd7000 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,10 +1,5 @@ #!/bin/bash -e -if [ "$JOB" == "3.8-dev" ]; then - /bin/bash ci/build38.sh - exit 0 -fi - # edit the locale file if needed if [ -n "$LOCALE_OVERRIDE" ]; then echo "Adding locale to the first line of pandas/__init__.py" From b5371e4c5727e927f1e6326429b1f7e94def4acd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 19 Nov 2019 05:20:14 -0800 Subject: [PATCH 113/185] use _extract_result (#29702) --- pandas/_libs/reduction.pyx | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index f5521b94b6c33..ea54b00cf5be4 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -135,9 +135,8 @@ cdef class Reducer: else: res = self.f(chunk) - if (not _is_sparse_array(res) and hasattr(res, 'values') - and util.is_array(res.values)): - res = res.values + # TODO: reason for not squeezing here? + res = _extract_result(res, squeeze=False) if i == 0: # On the first pass, we check the output shape to see # if this looks like a reduction. 
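Both call sites in this patch now share ``_extract_result``; a pure-Python
sketch of its behavior after the change (the real function, in the hunk below,
is Cython and also special-cases sparse arrays — the name and layout here are
simplified for illustration):

    import numpy as np

    def extract_result(res, squeeze=True):
        # unwrap Series-like results to their underlying ndarray
        if hasattr(res, "values") and isinstance(res.values, np.ndarray):
            res = res.values
        if isinstance(res, np.ndarray):
            if res.ndim == 0:
                res = res.item()  # zero-dim array -> scalar
            elif squeeze and res.ndim == 1 and len(res) == 1:
                res = res[0]  # length-1 vector -> scalar, only when squeezing
        return res

The ``Reducer`` call site above passes ``squeeze=False``, so a length-1 result
survives intact for the reduction-shape check that follows it.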
@@ -402,18 +401,17 @@ cdef class SeriesGrouper(_BaseGrouper): return result, counts -cdef inline _extract_result(object res): +cdef inline _extract_result(object res, bint squeeze=True): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ if (not _is_sparse_array(res) and hasattr(res, 'values') and util.is_array(res.values)): res = res.values - if not np.isscalar(res): - if util.is_array(res): - if res.ndim == 0: - res = res.item() - elif res.ndim == 1 and len(res) == 1: - res = res[0] + if util.is_array(res): + if res.ndim == 0: + res = res.item() + elif squeeze and res.ndim == 1 and len(res) == 1: + res = res[0] return res From 7a331c94005ffc0600a9a9dbcc294dae8cae865f Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 19 Nov 2019 05:26:58 -0800 Subject: [PATCH 114/185] Removed compat_helper.h (#29693) --- pandas/_libs/internals.pyx | 17 ++++-------- pandas/_libs/src/compat_helper.h | 45 -------------------------------- setup.py | 9 +++---- 3 files changed, 8 insertions(+), 63 deletions(-) delete mode 100644 pandas/_libs/src/compat_helper.h diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8e61a772912af..ba108c4524b9c 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,7 +1,7 @@ import cython from cython import Py_ssize_t -from cpython.object cimport PyObject +from cpython.slice cimport PySlice_GetIndicesEx cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX @@ -9,13 +9,6 @@ cdef extern from "Python.h": import numpy as np from numpy cimport int64_t -cdef extern from "compat_helper.h": - cdef int slice_get_indices(PyObject* s, Py_ssize_t length, - Py_ssize_t *start, Py_ssize_t *stop, - Py_ssize_t *step, - Py_ssize_t *slicelength) except -1 - - from pandas._libs.algos import ensure_int64 @@ -258,8 +251,8 @@ cpdef Py_ssize_t slice_len( if slc is None: raise TypeError("slc must be slice") - slice_get_indices(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) return length @@ -278,8 +271,8 @@ cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): if slc is None: raise TypeError("slc should be a slice") - slice_get_indices(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) return start, stop, step, length diff --git a/pandas/_libs/src/compat_helper.h b/pandas/_libs/src/compat_helper.h deleted file mode 100644 index 01d5b843d1bb6..0000000000000 --- a/pandas/_libs/src/compat_helper.h +++ /dev/null @@ -1,45 +0,0 @@ -/* -Copyright (c) 2016, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. -*/ - -#ifndef PANDAS__LIBS_SRC_COMPAT_HELPER_H_ -#define PANDAS__LIBS_SRC_COMPAT_HELPER_H_ - -#include "Python.h" -#include "inline_helper.h" - -/* -PySlice_GetIndicesEx changes signature in PY3 -but 3.6.1 in particular changes the behavior of this function slightly -https://bugs.python.org/issue27867 - - -In 3.6.1 PySlice_GetIndicesEx was changed to a macro -inadvertently breaking ABI compat. For now, undefing -the macro, which restores compat. 
-https://github.com/pandas-dev/pandas/issues/15961 -https://bugs.python.org/issue29943 -*/ - -#ifndef PYPY_VERSION -# if PY_VERSION_HEX < 0x03070000 && defined(PySlice_GetIndicesEx) -# undef PySlice_GetIndicesEx -# endif // PY_VERSION_HEX -#endif // PYPY_VERSION - -PANDAS_INLINE int slice_get_indices(PyObject *s, - Py_ssize_t length, - Py_ssize_t *start, - Py_ssize_t *stop, - Py_ssize_t *step, - Py_ssize_t *slicelength) { - return PySlice_GetIndicesEx(s, length, start, stop, - step, slicelength); -} - -#endif // PANDAS__LIBS_SRC_COMPAT_HELPER_H_ diff --git a/setup.py b/setup.py index a7bc7a333cdd6..545765ecb114d 100755 --- a/setup.py +++ b/setup.py @@ -83,10 +83,7 @@ def is_platform_mac(): _pxi_dep_template = { - "algos": [ - "_libs/algos_common_helper.pxi.in", - "_libs/algos_take_helper.pxi.in", - ], + "algos": ["_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in"], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -544,7 +541,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ts_include = ["pandas/_libs/tslibs/src", "pandas/_libs/tslibs"] -lib_depends = ["pandas/_libs/src/parse_helper.h", "pandas/_libs/src/compat_helper.h"] +lib_depends = ["pandas/_libs/src/parse_helper.h"] np_datetime_headers = [ "pandas/_libs/tslibs/src/datetime/np_datetime.h", @@ -823,5 +820,5 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): entry_points={ "pandas_plotting_backends": ["matplotlib = pandas:plotting._matplotlib"] }, - **setuptools_kwargs + **setuptools_kwargs, ) From 07e6b9db8ab464f90608b8dea7210ba0481a1982 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 19 Nov 2019 05:30:17 -0800 Subject: [PATCH 115/185] REF: align transform logic flow (#29672) --- pandas/core/groupby/generic.py | 83 ++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 06fd3c1eae006..31563e4bccbb7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -394,35 +394,39 @@ def _aggregate_named(self, func, *args, **kwargs): def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func - if isinstance(func, str): - if not (func in base.transform_kernel_whitelist): - msg = "'{func}' is not a valid function name for transform(name)" - raise ValueError(msg.format(func=func)) - if func in base.cythonized_kernels: - # cythonized transform or canned "agg+broadcast" - return getattr(self, func)(*args, **kwargs) - else: - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - return self._transform_fast( - lambda: getattr(self, func)(*args, **kwargs), func - ) + if not isinstance(func, str): + return self._transform_general(func, *args, **kwargs) + + elif func not in base.transform_kernel_whitelist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels: + # cythonized transform or canned "agg+broadcast" + return getattr(self, func)(*args, **kwargs) - # reg transform + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + result = getattr(self, func)(*args, **kwargs) + return self._transform_fast(result, func) + + def _transform_general(self, func, *args, **kwargs): + """ + Transform with a non-str `func`. 
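+
+        Applies ``func`` group-by-group and stitches the pieces back
+        together on the index of the original object.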
+ """ klass = self._selected_obj.__class__ + results = [] - wrapper = lambda x: func(x, *args, **kwargs) for name, group in self: object.__setattr__(group, "name", name) - res = wrapper(group) + res = func(group, *args, **kwargs) if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values indexer = self._get_index(name) - s = klass(res, indexer) - results.append(s) + ser = klass(res, indexer) + results.append(ser) # check for empty "results" to avoid concat ValueError if results: @@ -433,7 +437,7 @@ def transform(self, func, *args, **kwargs): result = Series() # we will only try to coerce the result type if - # we have a numeric dtype, as these are *always* udfs + # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) dtype = self._selected_obj.dtype if is_numeric_dtype(dtype): @@ -443,17 +447,14 @@ def transform(self, func, *args, **kwargs): result.index = self._selected_obj.index return result - def _transform_fast(self, func, func_nm) -> Series: + def _transform_fast(self, result, func_nm: str) -> Series: """ fast version of transform, only applicable to builtin/cythonizable functions """ - if isinstance(func, str): - func = getattr(self, func) - ids, _, ngroup = self.grouper.group_info cast = self._transform_should_cast(func_nm) - out = algorithms.take_1d(func()._values, ids) + out = algorithms.take_1d(result._values, ids) if cast: out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) @@ -1333,21 +1334,21 @@ def transform(self, func, *args, **kwargs): # optimized transforms func = self._get_cython_func(func) or func - if isinstance(func, str): - if not (func in base.transform_kernel_whitelist): - msg = "'{func}' is not a valid function name for transform(name)" - raise ValueError(msg.format(func=func)) - if func in base.cythonized_kernels: - # cythonized transformation or canned "reduction+broadcast" - return getattr(self, func)(*args, **kwargs) - else: - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - result = getattr(self, func)(*args, **kwargs) - else: + if not isinstance(func, str): return self._transform_general(func, *args, **kwargs) + elif func not in base.transform_kernel_whitelist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels: + # cythonized transformation or canned "reduction+broadcast" + return getattr(self, func)(*args, **kwargs) + + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. 
+ result = getattr(self, func)(*args, **kwargs) + # a reduction transform if not isinstance(result, DataFrame): return self._transform_general(func, *args, **kwargs) @@ -1358,9 +1359,9 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - return self._transform_fast(result, obj, func) + return self._transform_fast(result, func) - def _transform_fast(self, result: DataFrame, obj: DataFrame, func_nm) -> DataFrame: + def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: """ Fast transform path for aggregations """ @@ -1368,6 +1369,8 @@ def _transform_fast(self, result: DataFrame, obj: DataFrame, func_nm) -> DataFra # try casting data to original dtype cast = self._transform_should_cast(func_nm) + obj = self._obj_with_exclusions + # for each col, reshape to to size of original frame # by take operation ids, _, ngroup = self.grouper.group_info From 30059081e946a2020d08d49bf4fa7b771d10089a Mon Sep 17 00:00:00 2001 From: Mateusz Date: Tue, 19 Nov 2019 14:31:12 +0100 Subject: [PATCH 116/185] BUG: resolved problem with DataFrame.equals() (#28839) (#29657) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/internals/managers.py | 6 +++--- pandas/tests/internals/test_internals.py | 7 +++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0027343a13b60..c24e0b057afd7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -452,6 +452,7 @@ Reshaping - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). - Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) - Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`) +- Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) Sparse ^^^^^^ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8a9410c076f9b..0e6ba8a2c2a6a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1394,12 +1394,12 @@ def equals(self, other): if len(self.blocks) != len(other.blocks): return False - # canonicalize block order, using a tuple combining the type - # name and then mgr_locs because there might be unconsolidated + # canonicalize block order, using a tuple combining the mgr_locs + # then type name because there might be unconsolidated # blocks (say, Categorical) which can only be distinguished by # the iteration order def canonicalize(block): - return (block.dtype.name, block.mgr_locs.as_array.tolist()) + return (block.mgr_locs.as_array.tolist(), block.dtype.name) self_blocks = sorted(self.blocks, key=canonicalize) other_blocks = sorted(other.blocks, key=canonicalize) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index dbd84f15d143c..c98bdab0df766 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1297,3 +1297,10 @@ def test_make_block_no_pandas_array(): result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype) assert result.is_integer is True assert result.is_extension is False + + +def test_dataframe_not_equal(): + # see GH28839 + df1 = 
pd.DataFrame({"a": [1, 2], "b": ["s", "d"]}) + df2 = pd.DataFrame({"a": ["s", "d"], "b": [1, 2]}) + assert df1.equals(df2) is False From a92109231c9f24e31aad0ade96f51aa374cbf1f2 Mon Sep 17 00:00:00 2001 From: Gabriel Corona Date: Tue, 19 Nov 2019 14:42:58 +0100 Subject: [PATCH 117/185] BUG: fix dtype for .resample().size()/count() of empty series/dataframe (#28427) (#28459) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/resample.py | 35 ++++++++++++------- pandas/tests/resample/test_base.py | 54 ++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c24e0b057afd7..54e54751a1f89 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -433,6 +433,7 @@ Groupby/resample/rolling - - Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`) +- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty series or dataframe (:issue:`28427`) - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) - Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue: `15584`). - Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue: `19248`). diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 5bb0716728778..81ec4f45ec8e1 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -869,13 +869,32 @@ def var(self, ddof=1, *args, **kwargs): @Appender(GroupBy.size.__doc__) def size(self): - # It's a special case as higher level does return - # a copy of 0-len objects. 
GH14962 result = self._downsample("size") - if not len(self.ax) and isinstance(self._selected_obj, ABCDataFrame): + if not len(self.ax): from pandas import Series - result = Series([], index=result.index, dtype="int64") + if self._selected_obj.ndim == 1: + name = self._selected_obj.name + else: + name = None + result = Series([], index=result.index, dtype="int64", name=name) + return result + + @Appender(GroupBy.count.__doc__) + def count(self): + result = self._downsample("count") + if not len(self.ax): + if self._selected_obj.ndim == 1: + result = self._selected_obj.__class__( + [], index=result.index, dtype="int64", name=self._selected_obj.name + ) + else: + from pandas import DataFrame + + result = DataFrame( + [], index=result.index, columns=result.columns, dtype="int64" + ) + return result def quantile(self, q=0.5, **kwargs): @@ -923,14 +942,6 @@ def g(self, _method=method, *args, **kwargs): g.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, g) -# groupby & aggregate methods -for method in ["count"]: - - def h(self, _method=method): - return self._downsample(_method) - - h.__doc__ = getattr(GroupBy, method).__doc__ - setattr(Resampler, method, h) # series only methods for method in ["nunique"]: diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index dc72800227c0e..161581e16b6fe 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -112,6 +112,22 @@ def test_resample_empty_series(freq, empty_series, resample_method): tm.assert_series_equal(result, expected, check_dtype=False) +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("resample_method", ["count", "size"]) +def test_resample_count_empty_series(freq, empty_series, resample_method): + # GH28427 + result = getattr(empty_series.resample(freq), resample_method)() + + if isinstance(empty_series.index, PeriodIndex): + index = empty_series.index.asfreq(freq=freq) + else: + index = empty_series.index._shallow_copy(freq=freq) + expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) + + tm.assert_series_equal(result, expected) + + @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) def test_resample_empty_dataframe(empty_frame, freq, resample_method): @@ -136,6 +152,44 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): # test size for GH13212 (currently stays as df) +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_count_empty_dataframe(freq, empty_frame): + # GH28427 + + empty_frame = empty_frame.copy() + empty_frame["a"] = [] + + result = empty_frame.resample(freq).count() + + if isinstance(empty_frame.index, PeriodIndex): + index = empty_frame.index.asfreq(freq=freq) + else: + index = empty_frame.index._shallow_copy(freq=freq) + expected = pd.DataFrame({"a": []}, dtype="int64", index=index) + + tm.assert_frame_equal(result, expected) + + +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_size_empty_dataframe(freq, empty_frame): + # GH28427 + + empty_frame = empty_frame.copy() + empty_frame["a"] = [] + + result = empty_frame.resample(freq).size() + + if isinstance(empty_frame.index, PeriodIndex): + index = empty_frame.index.asfreq(freq=freq) + else: + index = empty_frame.index._shallow_copy(freq=freq) + expected = pd.Series([], dtype="int64", index=index) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) 
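# (the GH28427 tests above additionally pin the int64 result dtype for
#  count/size on empty objects; this parametrized test exercises empty
#  inputs across index and dtype combinations)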
@pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"]) def test_resample_empty_dtypes(index, dtype, resample_method): From 6a74261eca38e94dd31ff69bb69fdf544d096c7c Mon Sep 17 00:00:00 2001 From: Prakhar Pandey Date: Tue, 19 Nov 2019 20:37:01 +0530 Subject: [PATCH 118/185] TST: Split pandas/tests/frame/test_indexing into a directory (#29544) (#29694) --- .../tests/frame/indexing/test_categorical.py | 388 +++++++ pandas/tests/frame/indexing/test_datetime.py | 62 + .../frame/{ => indexing}/test_indexing.py | 1013 +---------------- pandas/tests/frame/indexing/test_where.py | 582 ++++++++++ 4 files changed, 1033 insertions(+), 1012 deletions(-) create mode 100644 pandas/tests/frame/indexing/test_categorical.py create mode 100644 pandas/tests/frame/indexing/test_datetime.py rename pandas/tests/frame/{ => indexing}/test_indexing.py (73%) create mode 100644 pandas/tests/frame/indexing/test_where.py diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py new file mode 100644 index 0000000000000..b595e48797d41 --- /dev/null +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -0,0 +1,388 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, DataFrame, Index, Series +import pandas.util.testing as tm + + +class TestDataFrameIndexingCategorical: + def test_assignment(self): + # assignment + df = DataFrame( + {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} + ) + labels = Categorical( + ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + ) + + df = df.sort_values(by=["value"], ascending=True) + s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) + d = s.values + df["D"] = d + str(df) + + result = df.dtypes + expected = Series( + [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], + index=["value", "D"], + ) + tm.assert_series_equal(result, expected) + + df["E"] = s + str(df) + + result = df.dtypes + expected = Series( + [ + np.dtype("int32"), + CategoricalDtype(categories=labels, ordered=False), + CategoricalDtype(categories=labels, ordered=False), + ], + index=["value", "D", "E"], + ) + tm.assert_series_equal(result, expected) + + result1 = df["D"] + result2 = df["E"] + tm.assert_categorical_equal(result1._data._block.values, d) + + # sorting + s.name = "E" + tm.assert_series_equal(result2.sort_index(), s.sort_index()) + + cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) + df = DataFrame(Series(cat)) + + def test_assigning_ops(self): + # systematically test the assigning operations: + # for all slicing ops: + # for value in categories and value not in categories: + + # - assign a single value -> exp_single_cats_value + + # - assign a complete row (mixed values) -> exp_single_row + + # assign multiple rows (mixed values) (-> array) -> exp_multi_row + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + + cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) + values = [1, 1, 1, 1, 1, 1, 1] + orig = DataFrame({"cats": cats, "values": values}, index=idx) + + # the expected values + # changed single row + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values1 = [1, 1, 
2, 1, 1, 1, 1] + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) + + # changed multiple rows + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) + + # changed part of the cats column + cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values3 = [1, 1, 1, 1, 1, 1, 1] + exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) + + # changed single value in cats col + cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values4 = [1, 1, 1, 1, 1, 1, 1] + exp_single_cats_value = DataFrame( + {"cats": cats4, "values": values4}, index=idx4 + ) + + # iloc + # ############### + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.iloc[2, 0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.iloc[df.index == "j", 0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2, 0] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.iloc[2, :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2, :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.iloc[2:4, :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2:4, :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.iloc[2:4, 0] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.iloc[2:4, 0] = ["c", "c"] + + # loc + # ############## + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.loc["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.loc[df.index == "j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", "cats"] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.loc["j", :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + 
df.loc["j":"k", :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j":"k", :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical( + ["b", "b"], categories=["a", "b", "c"] + ) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical( + ["c", "c"], categories=["a", "b", "c"] + ) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", "cats"] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.loc["j":"k", "cats"] = ["c", "c"] + + # loc + # ############## + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.loc["j", df.columns[0]] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.loc[df.index == "j", df.columns[0]] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", df.columns[0]] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.loc["j", :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.loc["j":"k", :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j":"k", :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical( + ["b", "b"], categories=["a", "b", "c"] + ) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical( + ["c", "c"], categories=["a", "b", "c"] + ) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", df.columns[0]] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.loc["j":"k", df.columns[0]] = ["c", "c"] + + # iat + df = orig.copy() + df.iat[2, 0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iat[2, 0] = "c" + + # at + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.at["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.at["j", "cats"] = "c" + + # fancy indexing + catsf = Categorical( + 
["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] + ) + idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) + valuesf = [1, 1, 3, 3, 1, 1, 1] + df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) + + exp_fancy = exp_multi_row.copy() + exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) + + df[df["cats"] == "c"] = ["b", 2] + # category c is kept in .categories + tm.assert_frame_equal(df, exp_fancy) + + # set_value + df = orig.copy() + df.at["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + with pytest.raises(ValueError): + df = orig.copy() + df.at["j", "cats"] = "c" + + # Assigning a Category to parts of a int/... column uses the values of + # the Categorical + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) + exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) + df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) + df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp) + + def test_functions_no_warnings(self): + df = DataFrame({"value": np.random.randint(0, 100, 20)}) + labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] + with tm.assert_produces_warning(False): + df["group"] = pd.cut( + df.value, range(0, 105, 10), right=False, labels=labels + ) + + def test_loc_indexing_preserves_index_category_dtype(self): + # GH 15166 + df = DataFrame( + data=np.arange(2, 22, 2), + index=pd.MultiIndex( + levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"], + ), + ) + + expected = pd.CategoricalIndex( + ["a", "b"], + categories=["a", "b"], + ordered=False, + name="Index1", + dtype="category", + ) + + result = df.index.levels[0] + tm.assert_index_equal(result, expected) + + result = df.loc[["a"]].index.levels[0] + tm.assert_index_equal(result, expected) + + def test_wrong_length_cat_dtype_raises(self): + # GH29523 + cat = pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) + df = pd.DataFrame({"bar": range(10)}) + err = "Length of values does not match length of index" + with pytest.raises(ValueError, match=err): + df["foo"] = cat diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py new file mode 100644 index 0000000000000..bde35c04acf4f --- /dev/null +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -0,0 +1,62 @@ +import pandas as pd +from pandas import DataFrame, Index, Series, date_range, notna +import pandas.util.testing as tm + + +class TestDataFrameIndexingDatetimeWithTZ: + def test_setitem(self, timezone_frame): + + df = timezone_frame + idx = df["B"].rename("foo") + + # setitem + df["C"] = idx + tm.assert_series_equal(df["C"], Series(idx, name="C")) + + df["D"] = "foo" + df["D"] = idx + tm.assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] + + # assert that A & C are not sharing the same base (e.g. 
they
+        # are copies)
+        b1 = df._data.blocks[1]
+        b2 = df._data.blocks[2]
+        tm.assert_extension_array_equal(b1.values, b2.values)
+        assert id(b1.values._data.base) != id(b2.values._data.base)
+
+        # with nan
+        df2 = df.copy()
+        df2.iloc[1, 1] = pd.NaT
+        df2.iloc[1, 2] = pd.NaT
+        result = df2["B"]
+        tm.assert_series_equal(notna(result), Series([True, False, True], name="B"))
+        tm.assert_series_equal(df2.dtypes, df.dtypes)
+
+    def test_set_reset(self):
+
+        idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
+
+        # set/reset
+        df = DataFrame({"A": [0, 1, 2]}, index=idx)
+        result = df.reset_index()
+        assert result["foo"].dtype == "datetime64[ns, US/Eastern]"
+
+        df = result.set_index("foo")
+        tm.assert_index_equal(df.index, idx)
+
+    def test_transpose(self, timezone_frame):
+
+        result = timezone_frame.T
+        expected = DataFrame(timezone_frame.values.T)
+        expected.index = ["A", "B", "C"]
+        tm.assert_frame_equal(result, expected)
+
+    def test_scalar_assignment(self):
+        # issue #19843
+        df = pd.DataFrame(index=(0, 1, 2))
+        df["now"] = pd.Timestamp("20130101", tz="UTC")
+        expected = pd.DataFrame(
+            {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2]
+        )
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
similarity index 73%
rename from pandas/tests/frame/test_indexing.py
rename to pandas/tests/frame/indexing/test_indexing.py
index e37f734c6235e..24a431fe42cf8 100644
--- a/pandas/tests/frame/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -7,12 +7,10 @@
 
 from pandas._libs.tslib import iNaT
 
-from pandas.core.dtypes.common import is_float_dtype, is_integer, is_scalar
-from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas.core.dtypes.common import is_float_dtype, is_integer
 
 import pandas as pd
 from pandas import (
-    Categorical,
     DataFrame,
     DatetimeIndex,
     Index,
@@ -2695,576 +2693,6 @@ def test_boolean_indexing_mixed(self):
         with pytest.raises(TypeError, match=msg):
             df[df > 0.3] = 1
 
-    def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame):
-        default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
-
-        def _safe_add(df):
-            # only add to the numeric items
-            def is_ok(s):
-                return (
-                    issubclass(s.dtype.type, (np.integer, np.floating))
-                    and s.dtype != "uint8"
-                )
-
-            return DataFrame(
-                dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items())
-            )
-
-        def _check_get(df, cond, check_dtypes=True):
-            other1 = _safe_add(df)
-            rs = df.where(cond, other1)
-            rs2 = df.where(cond.values, other1)
-            for k, v in rs.items():
-                exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index)
-                tm.assert_series_equal(v, exp, check_names=False)
-            tm.assert_frame_equal(rs, rs2)
-
-            # dtypes
-            if check_dtypes:
-                assert (rs.dtypes == df.dtypes).all()
-
-        # check getting
-        for df in [
-            default_frame,
-            float_string_frame,
-            mixed_float_frame,
-            mixed_int_frame,
-        ]:
-            if df is float_string_frame:
-                with pytest.raises(TypeError):
-                    df > 0
-                continue
-            cond = df > 0
-            _check_get(df, cond)
-
-        # upcasting case (GH # 2794)
-        df = DataFrame(
-            {
-                c: Series([1] * 3, dtype=c)
-                for c in ["float32", "float64", "int32", "int64"]
-            }
-        )
-        df.iloc[1, :] = 0
-        result = df.dtypes
-        expected = Series(
-            [
-                np.dtype("float32"),
-                np.dtype("float64"),
-                np.dtype("int32"),
-                np.dtype("int64"),
-            ],
-            index=["float32", "float64", "int32", "int64"],
-        )
-
-        # when we don't preserve boolean casts
-        #
-        # expected = Series({ 'float32' : 1, 'float64' : 3 })
-
-
tm.assert_series_equal(result, expected) - - # aligning - def _check_align(df, cond, other, check_dtypes=True): - rs = df.where(cond, other) - for i, k in enumerate(rs.columns): - result = rs[k] - d = df[k].values - c = cond[k].reindex(df[k].index).fillna(False).values - - if is_scalar(other): - o = other - else: - if isinstance(other, np.ndarray): - o = Series(other[:, i], index=result.index).values - else: - o = other[k].values - - new_values = d if c.all() else np.where(c, d, o) - expected = Series(new_values, index=result.index, name=k) - - # since we can't always have the correct numpy dtype - # as numpy doesn't know how to downcast, don't check - tm.assert_series_equal(result, expected, check_dtype=False) - - # dtypes - # can't check dtype when other is an ndarray - - if check_dtypes and not isinstance(other, np.ndarray): - assert (rs.dtypes == df.dtypes).all() - - for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - - # other is a frame - cond = (df > 0)[1:] - _check_align(df, cond, _safe_add(df)) - - # check other is ndarray - cond = df > 0 - _check_align(df, cond, (_safe_add(df).values)) - - # integers are upcast, so don't check the dtypes - cond = df > 0 - check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) - _check_align(df, cond, np.nan, check_dtypes=check_dtypes) - - # invalid conditions - df = default_frame - err1 = (df + 1).values[0:2, :] - msg = "other must be the same shape as self when an ndarray" - with pytest.raises(ValueError, match=msg): - df.where(cond, err1) - - err2 = cond.iloc[:2, :].values - other1 = _safe_add(df) - msg = "Array conditional must be same shape as self" - with pytest.raises(ValueError, match=msg): - df.where(err2, other1) - - with pytest.raises(ValueError, match=msg): - df.mask(True) - with pytest.raises(ValueError, match=msg): - df.mask(0) - - # where inplace - def _check_set(df, cond, check_dtypes=True): - dfi = df.copy() - econd = cond.reindex_like(df).fillna(True) - expected = dfi.mask(~econd) - - dfi.where(cond, np.nan, inplace=True) - tm.assert_frame_equal(dfi, expected) - - # dtypes (and confirm upcasts)x - if check_dtypes: - for k, v in df.dtypes.items(): - if issubclass(v.type, np.integer) and not cond[k].all(): - v = np.dtype("float64") - assert dfi[k].dtype == v - - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - - cond = df > 0 - _check_set(df, cond) - - cond = df >= 0 - _check_set(df, cond) - - # aligning - cond = (df >= 0)[1:] - _check_set(df, cond) - - # GH 10218 - # test DataFrame.where with Series slicing - df = DataFrame({"a": range(3), "b": range(4, 7)}) - result = df.where(df["a"] == 1) - expected = df[df["a"] == 1].reindex(df.index) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("klass", [list, tuple, np.array]) - def test_where_array_like(self, klass): - # see gh-15414 - df = DataFrame({"a": [1, 2, 3]}) - cond = [[False], [True], [True]] - expected = DataFrame({"a": [np.nan, 2, 3]}) - - result = df.where(klass(cond)) - tm.assert_frame_equal(result, expected) - - df["b"] = 2 - expected["b"] = [2, np.nan, 2] - cond = [[False, True], [True, False], [True, True]] - - result = df.where(klass(cond)) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "cond", - [ - [[1], [0], [1]], - Series([[2], [5], [7]]), - DataFrame({"a": [2, 5, 
7]}), - [["True"], ["False"], ["True"]], - [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], - ], - ) - def test_where_invalid_input_single(self, cond): - # see gh-15414: only boolean arrays accepted - df = DataFrame({"a": [1, 2, 3]}) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - df.where(cond) - - @pytest.mark.parametrize( - "cond", - [ - [[0, 1], [1, 0], [1, 1]], - Series([[0, 2], [5, 0], [4, 7]]), - [["False", "True"], ["True", "False"], ["True", "True"]], - DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), - [ - [pd.NaT, Timestamp("2017-01-01")], - [Timestamp("2017-01-02"), pd.NaT], - [Timestamp("2017-01-03"), Timestamp("2017-01-03")], - ], - ], - ) - def test_where_invalid_input_multiple(self, cond): - # see gh-15414: only boolean arrays accepted - df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - df.where(cond) - - def test_where_dataframe_col_match(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - cond = DataFrame([[True, False, True], [False, False, True]]) - - result = df.where(cond) - expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) - tm.assert_frame_equal(result, expected) - - # this *does* align, though has no matching columns - cond.columns = ["a", "b", "c"] - result = df.where(cond) - expected = DataFrame(np.nan, index=df.index, columns=df.columns) - tm.assert_frame_equal(result, expected) - - def test_where_ndframe_align(self): - msg = "Array conditional must be same shape as self" - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - - cond = [True] - with pytest.raises(ValueError, match=msg): - df.where(cond) - - expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) - - out = df.where(Series(cond)) - tm.assert_frame_equal(out, expected) - - cond = np.array([False, True, False, True]) - with pytest.raises(ValueError, match=msg): - df.where(cond) - - expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) - - out = df.where(Series(cond)) - tm.assert_frame_equal(out, expected) - - def test_where_bug(self): - # see gh-2793 - df = DataFrame( - {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" - ) - expected = DataFrame( - {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64", - ) - result = df.where(df > 2, np.nan) - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(result > 2, np.nan, inplace=True) - tm.assert_frame_equal(result, expected) - - def test_where_bug_mixed(self, sint_dtype): - # see gh-2793 - df = DataFrame( - { - "a": np.array([1, 2, 3, 4], dtype=sint_dtype), - "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), - } - ) - - expected = DataFrame( - {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64", - ) - - result = df.where(df > 2, np.nan) - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(result > 2, np.nan, inplace=True) - tm.assert_frame_equal(result, expected) - - def test_where_bug_transposition(self): - # see gh-7506 - a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) - b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) - do_not_replace = b.isna() | (a > b) - - expected = a.copy() - expected[~do_not_replace] = b - - result = a.where(do_not_replace, b) - tm.assert_frame_equal(result, expected) - - a = DataFrame({0: [4, 6], 1: [1, 0]}) - b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) - do_not_replace = b.isna() | (a > b) - - 
expected = a.copy() - expected[~do_not_replace] = b - - result = a.where(do_not_replace, b) - tm.assert_frame_equal(result, expected) - - def test_where_datetime(self): - - # GH 3311 - df = DataFrame( - dict( - A=date_range("20130102", periods=5), - B=date_range("20130104", periods=5), - C=np.random.randn(5), - ) - ) - - stamp = datetime(2013, 1, 3) - with pytest.raises(TypeError): - df > stamp - - result = df[df.iloc[:, :-1] > stamp] - - expected = df.copy() - expected.loc[[0, 1], "A"] = np.nan - expected.loc[:, "C"] = np.nan - tm.assert_frame_equal(result, expected) - - def test_where_none(self): - # GH 4667 - # setting with None changes dtype - df = DataFrame({"series": Series(range(10))}).astype(float) - df[df > 7] = None - expected = DataFrame( - {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} - ) - tm.assert_frame_equal(df, expected) - - # GH 7656 - df = DataFrame( - [ - {"A": 1, "B": np.nan, "C": "Test"}, - {"A": np.nan, "B": "Test", "C": np.nan}, - ] - ) - msg = "boolean setting on mixed-type" - - with pytest.raises(TypeError, match=msg): - df.where(~isna(df), None, inplace=True) - - def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): - # see gh-21947 - df = pd.DataFrame(columns=["a"]) - cond = df.applymap(lambda x: x > 0) - - result = df.where(cond) - tm.assert_frame_equal(result, df) - - def test_where_align(self): - def create(): - df = DataFrame(np.random.randn(10, 3)) - df.iloc[3:5, 0] = np.nan - df.iloc[4:6, 1] = np.nan - df.iloc[5:8, 2] = np.nan - return df - - # series - df = create() - expected = df.fillna(df.mean()) - result = df.where(pd.notna(df), df.mean(), axis="columns") - tm.assert_frame_equal(result, expected) - - df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") - tm.assert_frame_equal(df, expected) - - df = create().fillna(0) - expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) - result = df.where(df > 0, df[0], axis="index") - tm.assert_frame_equal(result, expected) - result = df.where(df > 0, df[0], axis="rows") - tm.assert_frame_equal(result, expected) - - # frame - df = create() - expected = df.fillna(1) - result = df.where( - pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) - ) - tm.assert_frame_equal(result, expected) - - def test_where_complex(self): - # GH 6345 - expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) - df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) - df[df.abs() >= 5] = np.nan - tm.assert_frame_equal(df, expected) - - def test_where_axis(self): - # GH 9736 - df = DataFrame(np.random.randn(2, 2)) - mask = DataFrame([[False, False], [False, False]]) - s = Series([0, 1]) - - expected = DataFrame([[0, 0], [1, 1]], dtype="float64") - result = df.where(mask, s, axis="index") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[0, 1], [0, 1]], dtype="float64") - result = df.where(mask, s, axis="columns") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - # Upcast needed - df = DataFrame([[1, 2], [3, 4]], dtype="int64") - mask = DataFrame([[False, False], [False, False]]) - s = Series([0, np.nan]) - - expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") - result = df.where(mask, s, axis="index") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, 
axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[0, np.nan], [0, np.nan]]) - result = df.where(mask, s, axis="columns") - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - { - 0: np.array([0, 0], dtype="int64"), - 1: np.array([np.nan, np.nan], dtype="float64"), - } - ) - result = df.copy() - result.where(mask, s, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - # Multiple dtypes (=> multiple Blocks) - df = pd.concat( - [ - DataFrame(np.random.randn(10, 2)), - DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), - ], - ignore_index=True, - axis=1, - ) - mask = DataFrame(False, columns=df.columns, index=df.index) - s1 = Series(1, index=df.columns) - s2 = Series(2, index=df.index) - - result = df.where(mask, s1, axis="columns") - expected = DataFrame(1.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype("int64") - expected[3] = expected[3].astype("int64") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s1, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - result = df.where(mask, s2, axis="index") - expected = DataFrame(2.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype("int64") - expected[3] = expected[3].astype("int64") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s2, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - # DataFrame vs DataFrame - d1 = df.copy().drop(1, axis=0) - expected = df.copy() - expected.loc[1, :] = np.nan - - result = df.where(mask, d1) - tm.assert_frame_equal(result, expected) - result = df.where(mask, d1, axis="index") - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d1, inplace=True) - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d1, inplace=True, axis="index") - tm.assert_frame_equal(result, expected) - - d2 = df.copy().drop(1, axis=1) - expected = df.copy() - expected.loc[:, 1] = np.nan - - result = df.where(mask, d2) - tm.assert_frame_equal(result, expected) - result = df.where(mask, d2, axis="columns") - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d2, inplace=True) - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d2, inplace=True, axis="columns") - tm.assert_frame_equal(result, expected) - - def test_where_callable(self): - # GH 12533 - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = df.where(lambda x: x > 4, lambda x: x + 1) - exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.where(df > 4, df + 1)) - - # return ndarray and scalar - result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) - exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) - - # chain - result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) - exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) - - def test_where_tz_values(self, tz_naive_fixture): - df1 = DataFrame( - DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), - columns=["date"], - ) - df2 = DataFrame( - DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), - 
columns=["date"], - ) - mask = DataFrame([True, True, False], columns=["date"]) - exp = DataFrame( - DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), - columns=["date"], - ) - result = df1.where(mask, df2) - tm.assert_frame_equal(exp, result) - def test_mask(self): df = DataFrame(np.random.randn(5, 3)) cond = df > 0 @@ -3402,65 +2830,6 @@ def test_interval_index(self): tm.assert_series_equal(result, expected) -class TestDataFrameIndexingDatetimeWithTZ: - def test_setitem(self, timezone_frame): - - df = timezone_frame - idx = df["B"].rename("foo") - - # setitem - df["C"] = idx - tm.assert_series_equal(df["C"], Series(idx, name="C")) - - df["D"] = "foo" - df["D"] = idx - tm.assert_series_equal(df["D"], Series(idx, name="D")) - del df["D"] - - # assert that A & C are not sharing the same base (e.g. they - # are copies) - b1 = df._data.blocks[1] - b2 = df._data.blocks[2] - tm.assert_extension_array_equal(b1.values, b2.values) - assert id(b1.values._data.base) != id(b2.values._data.base) - - # with nan - df2 = df.copy() - df2.iloc[1, 1] = pd.NaT - df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal(df2.dtypes, df.dtypes) - - def test_set_reset(self): - - idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype, "M8[ns, US/Eastern" - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) - - def test_transpose(self, timezone_frame): - - result = timezone_frame.T - expected = DataFrame(timezone_frame.values.T) - expected.index = ["A", "B", "C"] - tm.assert_frame_equal(result, expected) - - def test_scalar_assignment(self): - # issue #19843 - df = pd.DataFrame(index=(0, 1, 2)) - df["now"] = pd.Timestamp("20130101", tz="UTC") - expected = pd.DataFrame( - {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] - ) - tm.assert_frame_equal(df, expected) - - class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): @@ -3509,383 +2878,3 @@ def test_transpose(self, uint64_frame): expected = DataFrame(uint64_frame.values.T) expected.index = ["A", "B"] tm.assert_frame_equal(result, expected) - - -class TestDataFrameIndexingCategorical: - def test_assignment(self): - # assignment - df = DataFrame( - {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} - ) - labels = Categorical( - ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - ) - - df = df.sort_values(by=["value"], ascending=True) - s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) - d = s.values - df["D"] = d - str(df) - - result = df.dtypes - expected = Series( - [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], - index=["value", "D"], - ) - tm.assert_series_equal(result, expected) - - df["E"] = s - str(df) - - result = df.dtypes - expected = Series( - [ - np.dtype("int32"), - CategoricalDtype(categories=labels, ordered=False), - CategoricalDtype(categories=labels, ordered=False), - ], - index=["value", "D", "E"], - ) - tm.assert_series_equal(result, expected) - - result1 = df["D"] - result2 = df["E"] - tm.assert_categorical_equal(result1._data._block.values, d) - - # sorting - s.name = "E" - tm.assert_series_equal(result2.sort_index(), s.sort_index()) - - cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) - df = DataFrame(Series(cat)) - - def test_assigning_ops(self): - # 
systematically test the assigning operations: - # for all slicing ops: - # for value in categories and value not in categories: - - # - assign a single value -> exp_single_cats_value - - # - assign a complete row (mixed values) -> exp_single_row - - # assign multiple rows (mixed values) (-> array) -> exp_multi_row - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - - cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) - idx = Index(["h", "i", "j", "k", "l", "m", "n"]) - values = [1, 1, 1, 1, 1, 1, 1] - orig = DataFrame({"cats": cats, "values": values}, index=idx) - - # the expected values - # changed single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) - - # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) - - # changed part of the cats column - cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values3 = [1, 1, 1, 1, 1, 1, 1] - exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) - - # changed single value in cats col - cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values4 = [1, 1, 1, 1, 1, 1, 1] - exp_single_cats_value = DataFrame( - {"cats": cats4, "values": values4}, index=idx4 - ) - - # iloc - # ############### - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.iloc[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.iloc[df.index == "j", 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2, 0] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.iloc[2, :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2, :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.iloc[2:4, :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2:4, :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.iloc[2:4, 0] = ["b", "b"] - 
tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.iloc[2:4, 0] = ["c", "c"] - - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", "cats"] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.loc["j":"k", "cats"] = ["c", "c"] - - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", df.columns[0]] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.loc["j":"k", 
df.columns[0]] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", df.columns[0]] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.loc["j":"k", df.columns[0]] = ["c", "c"] - - # iat - df = orig.copy() - df.iat[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iat[2, 0] = "c" - - # at - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.at["j", "cats"] = "c" - - # fancy indexing - catsf = Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] - ) - idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) - valuesf = [1, 1, 3, 3, 1, 1, 1] - df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) - - exp_fancy = exp_multi_row.copy() - exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) - - df[df["cats"] == "c"] = ["b", 2] - # category c is kept in .categories - tm.assert_frame_equal(df, exp_fancy) - - # set_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - with pytest.raises(ValueError): - df = orig.copy() - df.at["j", "cats"] = "c" - - # Assigning a Category to parts of a int/... column uses the values of - # the Categorical - df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) - df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) - - def test_functions_no_warnings(self): - df = DataFrame({"value": np.random.randint(0, 100, 20)}) - labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] - with tm.assert_produces_warning(False): - df["group"] = pd.cut( - df.value, range(0, 105, 10), right=False, labels=labels - ) - - def test_loc_indexing_preserves_index_category_dtype(self): - # GH 15166 - df = DataFrame( - data=np.arange(2, 22, 2), - index=pd.MultiIndex( - levels=[pd.CategoricalIndex(["a", "b"]), range(10)], - codes=[[0] * 5 + [1] * 5, range(10)], - names=["Index1", "Index2"], - ), - ) - - expected = pd.CategoricalIndex( - ["a", "b"], - categories=["a", "b"], - ordered=False, - name="Index1", - dtype="category", - ) - - result = df.index.levels[0] - tm.assert_index_equal(result, expected) - - result = df.loc[["a"]].index.levels[0] - tm.assert_index_equal(result, expected) - - def test_wrong_length_cat_dtype_raises(self): - # GH29523 - cat = pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) - df = pd.DataFrame({"bar": range(10)}) - err = "Length of values does not match length of index" - with pytest.raises(ValueError, match=err): - df["foo"] = cat diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py new file mode 100644 index 0000000000000..4fea190f28d7b --- /dev/null +++ b/pandas/tests/frame/indexing/test_where.py @@ -0,0 +1,582 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, 
Series, Timestamp, date_range, isna +import pandas.util.testing as tm + + +class TestDataFrameIndexingWhere: + def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): + default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) + + def _safe_add(df): + # only add to the numeric items + def is_ok(s): + return ( + issubclass(s.dtype.type, (np.integer, np.floating)) + and s.dtype != "uint8" + ) + + return DataFrame( + dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) + ) + + def _check_get(df, cond, check_dtypes=True): + other1 = _safe_add(df) + rs = df.where(cond, other1) + rs2 = df.where(cond.values, other1) + for k, v in rs.items(): + exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) + tm.assert_series_equal(v, exp, check_names=False) + tm.assert_frame_equal(rs, rs2) + + # dtypes + if check_dtypes: + assert (rs.dtypes == df.dtypes).all() + + # check getting + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + cond = df > 0 + _check_get(df, cond) + + # upcasting case (GH # 2794) + df = DataFrame( + { + c: Series([1] * 3, dtype=c) + for c in ["float32", "float64", "int32", "int64"] + } + ) + df.iloc[1, :] = 0 + result = df.dtypes + expected = Series( + [ + np.dtype("float32"), + np.dtype("float64"), + np.dtype("int32"), + np.dtype("int64"), + ], + index=["float32", "float64", "int32", "int64"], + ) + + # when we don't preserve boolean casts + # + # expected = Series({ 'float32' : 1, 'float64' : 3 }) + + tm.assert_series_equal(result, expected) + + # aligning + def _check_align(df, cond, other, check_dtypes=True): + rs = df.where(cond, other) + for i, k in enumerate(rs.columns): + result = rs[k] + d = df[k].values + c = cond[k].reindex(df[k].index).fillna(False).values + + if is_scalar(other): + o = other + else: + if isinstance(other, np.ndarray): + o = Series(other[:, i], index=result.index).values + else: + o = other[k].values + + new_values = d if c.all() else np.where(c, d, o) + expected = Series(new_values, index=result.index, name=k) + + # since we can't always have the correct numpy dtype + # as numpy doesn't know how to downcast, don't check + tm.assert_series_equal(result, expected, check_dtype=False) + + # dtypes + # can't check dtype when other is an ndarray + + if check_dtypes and not isinstance(other, np.ndarray): + assert (rs.dtypes == df.dtypes).all() + + for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + + # other is a frame + cond = (df > 0)[1:] + _check_align(df, cond, _safe_add(df)) + + # check other is ndarray + cond = df > 0 + _check_align(df, cond, (_safe_add(df).values)) + + # integers are upcast, so don't check the dtypes + cond = df > 0 + check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) + _check_align(df, cond, np.nan, check_dtypes=check_dtypes) + + # invalid conditions + df = default_frame + err1 = (df + 1).values[0:2, :] + msg = "other must be the same shape as self when an ndarray" + with pytest.raises(ValueError, match=msg): + df.where(cond, err1) + + err2 = cond.iloc[:2, :].values + other1 = _safe_add(df) + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + df.where(err2, other1) + + with pytest.raises(ValueError, match=msg): + df.mask(True) + with pytest.raises(ValueError, match=msg): + 
df.mask(0) + + # where inplace + def _check_set(df, cond, check_dtypes=True): + dfi = df.copy() + econd = cond.reindex_like(df).fillna(True) + expected = dfi.mask(~econd) + + dfi.where(cond, np.nan, inplace=True) + tm.assert_frame_equal(dfi, expected) + + # dtypes (and confirm upcasts)x + if check_dtypes: + for k, v in df.dtypes.items(): + if issubclass(v.type, np.integer) and not cond[k].all(): + v = np.dtype("float64") + assert dfi[k].dtype == v + + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + + cond = df > 0 + _check_set(df, cond) + + cond = df >= 0 + _check_set(df, cond) + + # aligning + cond = (df >= 0)[1:] + _check_set(df, cond) + + # GH 10218 + # test DataFrame.where with Series slicing + df = DataFrame({"a": range(3), "b": range(4, 7)}) + result = df.where(df["a"] == 1) + expected = df[df["a"] == 1].reindex(df.index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("klass", [list, tuple, np.array]) + def test_where_array_like(self, klass): + # see gh-15414 + df = DataFrame({"a": [1, 2, 3]}) + cond = [[False], [True], [True]] + expected = DataFrame({"a": [np.nan, 2, 3]}) + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + df["b"] = 2 + expected["b"] = [2, np.nan, 2] + cond = [[False, True], [True, False], [True, True]] + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "cond", + [ + [[1], [0], [1]], + Series([[2], [5], [7]]), + DataFrame({"a": [2, 5, 7]}), + [["True"], ["False"], ["True"]], + [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], + ], + ) + def test_where_invalid_input_single(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + @pytest.mark.parametrize( + "cond", + [ + [[0, 1], [1, 0], [1, 1]], + Series([[0, 2], [5, 0], [4, 7]]), + [["False", "True"], ["True", "False"], ["True", "True"]], + DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), + [ + [pd.NaT, Timestamp("2017-01-01")], + [Timestamp("2017-01-02"), pd.NaT], + [Timestamp("2017-01-03"), Timestamp("2017-01-03")], + ], + ], + ) + def test_where_invalid_input_multiple(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + def test_where_dataframe_col_match(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + cond = DataFrame([[True, False, True], [False, False, True]]) + + result = df.where(cond) + expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) + tm.assert_frame_equal(result, expected) + + # this *does* align, though has no matching columns + cond.columns = ["a", "b", "c"] + result = df.where(cond) + expected = DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_where_ndframe_align(self): + msg = "Array conditional must be same shape as self" + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + + cond = [True] + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + cond = np.array([False, True, False, True]) + with 
pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + def test_where_bug(self): + # see gh-2793 + df = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" + ) + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + result = df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_where_bug_mixed(self, sint_dtype): + # see gh-2793 + df = DataFrame( + { + "a": np.array([1, 2, 3, 4], dtype=sint_dtype), + "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), + } + ) + + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + + result = df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_where_bug_transposition(self): + # see gh-7506 + a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) + b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + a = DataFrame({0: [4, 6], 1: [1, 0]}) + b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + def test_where_datetime(self): + + # GH 3311 + df = DataFrame( + dict( + A=date_range("20130102", periods=5), + B=date_range("20130104", periods=5), + C=np.random.randn(5), + ) + ) + + stamp = datetime(2013, 1, 3) + with pytest.raises(TypeError): + df > stamp + + result = df[df.iloc[:, :-1] > stamp] + + expected = df.copy() + expected.loc[[0, 1], "A"] = np.nan + expected.loc[:, "C"] = np.nan + tm.assert_frame_equal(result, expected) + + def test_where_none(self): + # GH 4667 + # setting with None changes dtype + df = DataFrame({"series": Series(range(10))}).astype(float) + df[df > 7] = None + expected = DataFrame( + {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} + ) + tm.assert_frame_equal(df, expected) + + # GH 7656 + df = DataFrame( + [ + {"A": 1, "B": np.nan, "C": "Test"}, + {"A": np.nan, "B": "Test", "C": np.nan}, + ] + ) + msg = "boolean setting on mixed-type" + + with pytest.raises(TypeError, match=msg): + df.where(~isna(df), None, inplace=True) + + def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): + # see gh-21947 + df = pd.DataFrame(columns=["a"]) + cond = df.applymap(lambda x: x > 0) + + result = df.where(cond) + tm.assert_frame_equal(result, df) + + def test_where_align(self): + def create(): + df = DataFrame(np.random.randn(10, 3)) + df.iloc[3:5, 0] = np.nan + df.iloc[4:6, 1] = np.nan + df.iloc[5:8, 2] = np.nan + return df + + # series + df = create() + expected = df.fillna(df.mean()) + result = df.where(pd.notna(df), df.mean(), axis="columns") + tm.assert_frame_equal(result, expected) + + df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") + tm.assert_frame_equal(df, expected) + + df = create().fillna(0) + expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) + result = 
df.where(df > 0, df[0], axis="index") + tm.assert_frame_equal(result, expected) + result = df.where(df > 0, df[0], axis="rows") + tm.assert_frame_equal(result, expected) + + # frame + df = create() + expected = df.fillna(1) + result = df.where( + pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) + ) + tm.assert_frame_equal(result, expected) + + def test_where_complex(self): + # GH 6345 + expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) + df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) + df[df.abs() >= 5] = np.nan + tm.assert_frame_equal(df, expected) + + def test_where_axis(self): + # GH 9736 + df = DataFrame(np.random.randn(2, 2)) + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, 1]) + + expected = DataFrame([[0, 0], [1, 1]], dtype="float64") + result = df.where(mask, s, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, 1], [0, 1]], dtype="float64") + result = df.where(mask, s, axis="columns") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + # Upcast needed + df = DataFrame([[1, 2], [3, 4]], dtype="int64") + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, np.nan]) + + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") + result = df.where(mask, s, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, np.nan], [0, np.nan]]) + result = df.where(mask, s, axis="columns") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + 0: np.array([0, 0], dtype="int64"), + 1: np.array([np.nan, np.nan], dtype="float64"), + } + ) + result = df.copy() + result.where(mask, s, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + # Multiple dtypes (=> multiple Blocks) + df = pd.concat( + [ + DataFrame(np.random.randn(10, 2)), + DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), + ], + ignore_index=True, + axis=1, + ) + mask = DataFrame(False, columns=df.columns, index=df.index) + s1 = Series(1, index=df.columns) + s2 = Series(2, index=df.index) + + result = df.where(mask, s1, axis="columns") + expected = DataFrame(1.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s1, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + result = df.where(mask, s2, axis="index") + expected = DataFrame(2.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s2, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + # DataFrame vs DataFrame + d1 = df.copy().drop(1, axis=0) + expected = df.copy() + expected.loc[1, :] = np.nan + + result = df.where(mask, d1) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d1, axis="index") + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True) + tm.assert_frame_equal(result, expected) + result = 
df.copy() + result.where(mask, d1, inplace=True, axis="index") + tm.assert_frame_equal(result, expected) + + d2 = df.copy().drop(1, axis=1) + expected = df.copy() + expected.loc[:, 1] = np.nan + + result = df.where(mask, d2) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d2, axis="columns") + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True) + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True, axis="columns") + tm.assert_frame_equal(result, expected) + + def test_where_callable(self): + # GH 12533 + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.where(lambda x: x > 4, lambda x: x + 1) + exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df > 4, df + 1)) + + # return ndarray and scalar + result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) + exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) + + # chain + result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) + exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) + + def test_where_tz_values(self, tz_naive_fixture): + df1 = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + columns=["date"], + ) + df2 = DataFrame( + DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + mask = DataFrame([True, True, False], columns=["date"]) + exp = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + result = df1.where(mask, df2) + tm.assert_frame_equal(exp, result) From fe1803d9cdfbfe3023a39be94649b44854a4e596 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 19 Nov 2019 16:17:44 +0100 Subject: [PATCH 119/185] ENH: Support arrow/parquet roundtrip for nullable integer / string extension dtypes (#29483) * Add __from_arrow__ support for IntegerArray, StringArray --- doc/source/development/extending.rst | 42 ++++++++++++++++++++++ doc/source/user_guide/io.rst | 3 ++ doc/source/whatsnew/v1.0.0.rst | 3 ++ pandas/core/arrays/integer.py | 29 +++++++++++++++ pandas/core/arrays/string_.py | 18 ++++++++++ pandas/tests/arrays/string_/test_string.py | 16 +++++++++ pandas/tests/arrays/test_integer.py | 12 +++++++ pandas/tests/io/test_parquet.py | 14 +++++--- 8 files changed, 133 insertions(+), 4 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index e341dcb8318bc..89d43e8a43825 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -251,6 +251,48 @@ To use a test, subclass it: See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py for a list of all the tests available. +.. _extending.extension.arrow: + +Compatibility with Apache Arrow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An ``ExtensionArray`` can support conversion to / from ``pyarrow`` arrays +(and thus support for example serialization to the Parquet file format) +by implementing two methods: ``ExtensionArray.__arrow_array__`` and +``ExtensionDtype.__from_arrow__``. 
+
+The ``ExtensionArray.__arrow_array__`` ensures that ``pyarrow`` knows how
+to convert the specific extension array into a ``pyarrow.Array`` (also when
+included as a column in a pandas DataFrame):
+
+.. code-block:: python
+
+    class MyExtensionArray(ExtensionArray):
+        ...
+
+        def __arrow_array__(self, type=None):
+            # convert the underlying array values to a pyarrow Array
+            import pyarrow
+            return pyarrow.array(..., type=type)
+
+The ``ExtensionDtype.__from_arrow__`` method then controls the conversion
+back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow
+``Array`` or ``ChunkedArray`` as its only argument and is expected to return the
+appropriate pandas ``ExtensionArray`` for this dtype and the passed values:
+
+.. code-block:: none
+
+    class ExtensionDtype:
+        ...
+
+        def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> ExtensionArray:
+            ...
+
+See more in the `Arrow documentation `__.
+
+Those methods have been implemented for the nullable integer and string extension
+dtypes included in pandas, and ensure roundtrip to pyarrow and the Parquet file format.
+
 .. _extension dtype dtypes: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/dtypes.py
 .. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
 .. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 6e45d6748c2a5..fa47a5944f7bf 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4716,6 +4716,9 @@ Several caveats.
 * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization.
+* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data
+  type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols,
+  see the :ref:`extension types documentation `).
 
 You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``,
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 54e54751a1f89..e25049cecdc09 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -114,6 +114,9 @@ Other enhancements
 - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
 - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
 - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
+- Roundtripping DataFrames with nullable integer or string data types to parquet
+  (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the ``'pyarrow'`` engine
+  now preserves those data types with pyarrow >= 1.0.0 (:issue:`20612`).
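+
+  As an illustrative sketch of this behavior (the file name below is arbitrary,
+  and a pyarrow version new enough to understand the extension metadata is
+  assumed to be installed):
+
+  .. code-block:: python
+
+     df = pd.DataFrame(
+         {
+             "a": pd.array([1, 2, None], dtype="Int64"),
+             "b": pd.array(["x", "y", None], dtype="string"),
+         }
+     )
+     df.to_parquet("roundtrip.parquet", engine="pyarrow")
+     pd.read_parquet("roundtrip.parquet").dtypes  # "a" stays Int64, "b" stays string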
Build Changes
^^^^^^^^^^^^^
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index af7755fb1373d..63296b4a26354 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -85,6 +85,35 @@ def construct_array_type(cls):
         """
         return IntegerArray
 
+    def __from_arrow__(self, array):
+        """Construct IntegerArray from passed pyarrow Array/ChunkedArray"""
+        import pyarrow
+
+        if isinstance(array, pyarrow.Array):
+            chunks = [array]
+        else:
+            # pyarrow.ChunkedArray
+            chunks = array.chunks
+
+        results = []
+        for arr in chunks:
+            buflist = arr.buffers()
+            data = np.frombuffer(buflist[1], dtype=self.type)[
+                arr.offset : arr.offset + len(arr)
+            ]
+            bitmask = buflist[0]
+            if bitmask is not None:
+                mask = pyarrow.BooleanArray.from_buffers(
+                    pyarrow.bool_(), len(arr), [None, bitmask]
+                )
+                mask = np.asarray(mask)
+            else:
+                mask = np.ones(len(arr), dtype=bool)
+            int_arr = IntegerArray(data.copy(), ~mask, copy=False)
+            results.append(int_arr)
+
+        return IntegerArray._concat_same_type(results)
+
 
 def integer_array(values, dtype=None, copy=False):
     """
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 7c487b227de20..8599b5e39f34a 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -85,6 +85,24 @@ def construct_array_type(cls) -> "Type[StringArray]":
     def __repr__(self) -> str:
         return "StringDtype"
 
+    def __from_arrow__(self, array):
+        """Construct StringArray from passed pyarrow Array/ChunkedArray"""
+        import pyarrow
+
+        if isinstance(array, pyarrow.Array):
+            chunks = [array]
+        else:
+            # pyarrow.ChunkedArray
+            chunks = array.chunks
+
+        results = []
+        for arr in chunks:
+            # using _from_sequence to ensure None is converted to np.nan
+            str_arr = StringArray._from_sequence(np.array(arr))
+            results.append(str_arr)
+
+        return StringArray._concat_same_type(results)
+
 
 class StringArray(PandasArray):
     """
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index efe2b4e0b2deb..1ce62d8f8b3d9 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -171,3 +171,19 @@ def test_arrow_array():
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.string(), from_pandas=True)
     assert arr.equals(expected)
+
+
+@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+def test_arrow_roundtrip():
+    # roundtrip possible from arrow 1.0.0
+    import pyarrow as pa
+
+    data = pd.array(["a", "b", None], dtype="string")
+    df = pd.DataFrame({"a": data})
+    table = pa.table(df)
+    assert table.field("a").type == "string"
+    result = table.to_pandas()
+    assert isinstance(result["a"].dtype, pd.StringDtype)
+    tm.assert_frame_equal(result, df)
+    # ensure the missing value is represented by NaN and not None
+    assert np.isnan(result.loc[2, "a"])
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
index 025366e5b210b..443a0c7e71616 100644
--- a/pandas/tests/arrays/test_integer.py
+++ b/pandas/tests/arrays/test_integer.py
@@ -829,6 +829,18 @@ def test_arrow_array(data):
     assert arr.equals(expected)
 
 
+@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+def test_arrow_roundtrip(data):
+    # roundtrip possible from arrow 1.0.0
+    import pyarrow as pa
+
+    df = pd.DataFrame({"a": data})
+    table = pa.table(df)
+    assert table.field("a").type == str(data.dtype.numpy_dtype)
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
 @pytest.mark.parametrize(
     "pandasmethname, kwargs",
     [
diff --git 
a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 5dd671c659263..bcbbee3b86769 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -514,13 +514,19 @@ def test_additional_extension_arrays(self, pa):
                 "b": pd.Series(["a", None, "c"], dtype="string"),
             }
         )
-        # currently de-serialized as plain int / object
-        expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))
+        if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
+            expected = df
+        else:
+            # de-serialized as plain int / object
+            expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))
         check_round_trip(df, pa, expected=expected)
 
         df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
-        # if missing values in integer, currently de-serialized as float
-        expected = df.assign(a=df.a.astype("float64"))
+        if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
+            expected = df
+        else:
+            # if missing values in integer, currently de-serialized as float
+            expected = df.assign(a=df.a.astype("float64"))
         check_round_trip(df, pa, expected=expected)

From 6b1f73e60900306a506950d8fae3312d22b9e77e Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 19 Nov 2019 10:22:37 -0600
Subject: [PATCH 120/185] Refactor string methods for StringArray + return IntegerArray for numeric results (#29640)

---
 doc/source/user_guide/text.rst | 37 ++++++++++-
 doc/source/whatsnew/v1.0.0.rst | 10 ++-
 pandas/_libs/lib.pyx           | 24 +++++--
 pandas/core/strings.py         | 112 ++++++++++++++++++++++++++++----
 pandas/tests/test_strings.py   | 45 +++++++++++--
 5 files changed, 201 insertions(+), 27 deletions(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index d521c745ccfe5..072871f89bdae 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -13,7 +13,7 @@ Text Data Types
 
 .. versionadded:: 1.0.0
 
-There are two main ways to store text data
+There are two ways to store text data
 in pandas:
 
 1. ``object`` -dtype NumPy array.
 2. :class:`StringDtype` extension type.
@@ -63,7 +63,40 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
 
    s
    s.astype("string")
 
-Everything that follows in the rest of this document applies equally to
+.. _text.differences:
+
+Behavior differences
+^^^^^^^^^^^^^^^^^^^^
+
+These are places where the behavior of ``StringDtype`` objects differs from
+``object`` dtype:
+
+1. For ``StringDtype``, :ref:`string accessor methods`
+   that return **numeric** output will always return a nullable integer dtype,
+   rather than either int or float dtype, depending on the presence of NA values.
+
+   .. ipython:: python
+
+      s = pd.Series(["a", None, "b"], dtype="string")
+      s
+      s.str.count("a")
+      s.dropna().str.count("a")
+
+   Both outputs are ``Int64`` dtype. Compare that with object-dtype:
+
+   .. ipython:: python
+
+      s.astype(object).str.count("a")
+      s.astype(object).dropna().str.count("a")
+
+   When NA values are present, the output dtype is float64.
+
+2. Some string methods, like :meth:`Series.str.decode`, are not available
+   on ``StringArray`` because ``StringArray`` only holds strings, not
+   bytes.
+
+
+Everything else that follows in the rest of this document applies equally to
 ``string`` and ``object`` dtype.
 
 .. 
_text.string_methods: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e25049cecdc09..011834516eafd 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -63,7 +63,7 @@ Previously, strings were typically stored in object-dtype NumPy arrays. ``StringDtype`` is currently considered experimental. The implementation and parts of the API may change without warning. -The text extension type solves several issues with object-dtype NumPy arrays: +The ``'string'`` extension type solves several issues with object-dtype NumPy arrays: 1. You can accidentally store a *mixture* of strings and non-strings in an ``object`` dtype array. A ``StringArray`` can only store strings. @@ -88,9 +88,17 @@ You can use the alias ``"string"`` as well. The usual string accessor methods work. Where appropriate, the return type of the Series or columns of a DataFrame will also have string dtype. +.. ipython:: python + s.str.upper() s.str.split('b', expand=True).dtypes +String accessor methods returning integers will return a value with :class:`Int64Dtype` + +.. ipython:: python + + s.str.count("a") + We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c1fd46f4bba9e..aaf6456df8f8e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2208,9 +2208,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects +_no_default = object() + + @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1): +def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, + object na_value=_no_default, object dtype=object): """ Substitute for np.vectorize with pandas-friendly dtype inference @@ -2218,6 +2222,15 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1) ---------- arr : ndarray f : function + mask : ndarray + uint8 dtype ndarray indicating values not to apply `f` to. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray + na_value : Any, optional + The result value to use for masked values. By default, the + input value is used + dtype : numpy.dtype + The numpy dtype to use for the result ndarray. 
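To illustrate the new keywords (an illustrative aside, not part of the patch; it assumes the compiled extension is importable as ``pandas._libs.lib``):

.. code-block:: python

    import numpy as np
    from pandas._libs import lib

    arr = np.array(["a", "bb", "ccc"], dtype=object)
    mask = np.array([0, 1, 0], dtype=np.uint8)  # treat the second element as missing

    # masked positions receive na_value instead of f(arr[i]), and the
    # result is allocated directly as int64 rather than object
    result = lib.map_infer_mask(
        arr, len, mask, convert=False, na_value=-1, dtype=np.dtype("int64")
    )
    # result: array([ 1, -1,  3])
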
 Returns
 -------
@@ -2225,14 +2238,17 @@
     """
     cdef:
         Py_ssize_t i, n
-        ndarray[object] result
+        ndarray result
         object val
 
     n = len(arr)
-    result = np.empty(n, dtype=object)
+    result = np.empty(n, dtype=dtype)
     for i in range(n):
         if mask[i]:
-            val = arr[i]
+            if na_value is _no_default:
+                val = arr[i]
+            else:
+                val = na_value
         else:
             val = f(arr[i])
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 55ce44d736864..413e7e85eb6fe 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2,7 +2,7 @@
 from functools import wraps
 import re
 import textwrap
-from typing import Dict, List
+from typing import TYPE_CHECKING, Any, Callable, Dict, List
 import warnings
 
 import numpy as np
@@ -15,10 +15,14 @@
     ensure_object,
     is_bool_dtype,
     is_categorical_dtype,
+    is_extension_array_dtype,
     is_integer,
+    is_integer_dtype,
     is_list_like,
+    is_object_dtype,
     is_re,
     is_scalar,
+    is_string_dtype,
 )
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -28,9 +32,14 @@
 )
 from pandas.core.dtypes.missing import isna
 
+from pandas._typing import ArrayLike, Dtype
 from pandas.core.algorithms import take_1d
 from pandas.core.base import NoNewAttributesMixin
 import pandas.core.common as com
+from pandas.core.construction import extract_array
+
+if TYPE_CHECKING:
+    from pandas.arrays import StringArray
 
 _cpython_optimized_encoders = (
     "utf-8",
@@ -109,10 +118,79 @@ def cat_safe(list_of_columns: List, sep: str):
 
 def _na_map(f, arr, na_result=np.nan, dtype=object):
     # should really _check_ for NA
-    return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
+    if is_extension_array_dtype(arr.dtype):
+        # just StringDtype
+        arr = extract_array(arr)
+        return _map_stringarray(f, arr, na_value=na_result, dtype=dtype)
+    return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
+
+
+def _map_stringarray(
+    func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype
+) -> ArrayLike:
+    """
+    Map a callable over valid elements of a StringArray.
+
+    Parameters
+    ----------
+    func : Callable[[str], Any]
+        Apply to each valid element.
+    arr : StringArray
+    na_value : Any
+        The value to use for missing values. By default, this is
+        the original value (NA).
+    dtype : Dtype
+        The result dtype to use. Specifying this avoids an intermediate
+        object-dtype allocation.
+
+    Returns
+    -------
+    ArrayLike
+        An ExtensionArray for integer or string dtypes, otherwise
+        an ndarray.
+
+    """
+    from pandas.arrays import IntegerArray, StringArray
+
+    mask = isna(arr)
+
+    assert isinstance(arr, StringArray)
+    arr = np.asarray(arr)
+
+    if is_integer_dtype(dtype):
+        na_value_is_na = isna(na_value)
+        if na_value_is_na:
+            na_value = 1
+        result = lib.map_infer_mask(
+            arr,
+            func,
+            mask.view("uint8"),
+            convert=False,
+            na_value=na_value,
+            dtype=np.dtype("int64"),
+        )
+
+        if not na_value_is_na:
+            mask[:] = False
+
+        return IntegerArray(result, mask)
+
+    elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+        # i.e. StringDtype
+        result = lib.map_infer_mask(
+            arr, func, mask.view("uint8"), convert=False, na_value=na_value
+        )
+        return StringArray(result)
+    # TODO: BooleanArray
+    else:
+        # This is when the result type is object. We reach this when
+        # -> We know the result type is truly object (e.g. .encode returns bytes
+        #    or .findall returns a list).
+        # -> We don't know the result type. E.g. `.get` can return anything.
+ return lib.map_infer_mask(arr, func, mask.view("uint8")) -def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): +def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object): if not len(arr): return np.ndarray(0, dtype=dtype) @@ -143,7 +221,7 @@ def g(x): except (TypeError, AttributeError): return na_value - return _map(g, arr, dtype=dtype) + return _map_object(g, arr, dtype=dtype) if na_value is not np.nan: np.putmask(result, mask, na_value) if result.dtype == object: @@ -634,7 +712,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): raise ValueError("Cannot use a callable replacement when regex=False") f = lambda x: x.replace(pat, repl, n) - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_repeat(arr, repeats): @@ -685,7 +763,7 @@ def scalar_rep(x): except TypeError: return str.__mul__(x, repeats) - return _na_map(scalar_rep, arr) + return _na_map(scalar_rep, arr, dtype=str) else: def rep(x, r): @@ -1150,7 +1228,7 @@ def str_join(arr, sep): 4 NaN dtype: object """ - return _na_map(sep.join, arr) + return _na_map(sep.join, arr, dtype=str) def str_findall(arr, pat, flags=0): @@ -1381,7 +1459,7 @@ def str_pad(arr, width, side="left", fillchar=" "): else: # pragma: no cover raise ValueError("Invalid side") - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_split(arr, pat=None, n=None): @@ -1487,7 +1565,7 @@ def str_slice(arr, start=None, stop=None, step=None): """ obj = slice(start, stop, step) f = lambda x: x[obj] - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_slice_replace(arr, start=None, stop=None, repl=None): @@ -1578,7 +1656,7 @@ def f(x): y += x[local_stop:] return y - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_strip(arr, to_strip=None, side="both"): @@ -1603,7 +1681,7 @@ def str_strip(arr, to_strip=None, side="both"): f = lambda x: x.rstrip(to_strip) else: # pragma: no cover raise ValueError("Invalid side") - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_wrap(arr, width, **kwargs): @@ -1667,7 +1745,7 @@ def str_wrap(arr, width, **kwargs): tw = textwrap.TextWrapper(**kwargs) - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr) + return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) def str_translate(arr, table): @@ -1687,7 +1765,7 @@ def str_translate(arr, table): ------- Series or Index """ - return _na_map(lambda x: x.translate(table), arr) + return _na_map(lambda x: x.translate(table), arr, dtype=str) def str_get(arr, i): @@ -3025,7 +3103,7 @@ def normalize(self, form): import unicodedata f = lambda x: unicodedata.normalize(form, x) - result = _na_map(f, self._parent) + result = _na_map(f, self._parent, dtype=str) return self._wrap_result(result) _shared_docs[ @@ -3223,31 +3301,37 @@ def rindex(self, sub, start=0, end=None): lambda x: x.lower(), name="lower", docstring=_shared_docs["casemethods"] % _doc_args["lower"], + dtype=str, ) upper = _noarg_wrapper( lambda x: x.upper(), name="upper", docstring=_shared_docs["casemethods"] % _doc_args["upper"], + dtype=str, ) title = _noarg_wrapper( lambda x: x.title(), name="title", docstring=_shared_docs["casemethods"] % _doc_args["title"], + dtype=str, ) capitalize = _noarg_wrapper( lambda x: x.capitalize(), name="capitalize", docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], + dtype=str, ) swapcase = _noarg_wrapper( lambda x: x.swapcase(), name="swapcase", docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], + dtype=str, ) casefold = _noarg_wrapper( lambda 
x: x.casefold(), name="casefold", docstring=_shared_docs["casemethods"] % _doc_args["casefold"], + dtype=str, ) _shared_docs[ diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f68541b620efa..1261c3bbc86db 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -731,7 +731,10 @@ def test_count(self): tm.assert_series_equal(result, exp) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_count(mixed, "a") xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) tm.assert_numpy_array_equal(rs, xp) @@ -755,14 +758,14 @@ def test_contains(self): expected = np.array([False, np.nan, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) - values = ["foo", "xyz", "fooommm__foo", "mmm_"] + values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # case insensitive using regex - values = ["Foo", "xYz", "fOOomMm__fOo", "MMM_"] + values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object) result = strings.str_contains(values, "FOO|mmm", case=False) expected = np.array([True, False, True, True]) tm.assert_numpy_array_equal(result, expected) @@ -773,7 +776,10 @@ def test_contains(self): tm.assert_numpy_array_equal(result, expected) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_contains(mixed, "o") xp = np.array( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], @@ -869,7 +875,10 @@ def test_endswith(self): tm.assert_series_equal(result, exp.fillna(False).astype(bool)) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_endswith(mixed, "f") xp = np.array( [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], @@ -3489,10 +3498,13 @@ def test_casefold(self): def test_string_array(any_string_method): + method_name, args, kwargs = any_string_method + if method_name == "decode": + pytest.skip("decode requires bytes.") + data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) b = Series(data, dtype="string") - method_name, args, kwargs = any_string_method expected = getattr(a.str, method_name)(*args, **kwargs) result = getattr(b.str, method_name)(*args, **kwargs) @@ -3503,8 +3515,29 @@ def test_string_array(any_string_method): ): assert result.dtype == "string" result = result.astype(object) + + elif expected.dtype == "float" and expected.isna().any(): + assert result.dtype == "Int64" + result = result.astype("float") + elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "method,expected", + [ + ("count", [2, None]), + ("find", [0, None]), + ("index", [0, None]), + ("rindex", [2, None]), + ], +) +def test_string_array_numeric_integer_array(method, expected): + s = Series(["aba", None], dtype="string") + result = 
getattr(s.str, method)("a")
+    expected = Series(expected, dtype="Int64")
+    tm.assert_series_equal(result, expected)

From 547f128eff607479c0fcda4b363595303b6272c8 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 19 Nov 2019 09:43:21 -0800
Subject: [PATCH 121/185] DEPR: enforce nested-renaming deprecation (#29608)

---
 doc/source/whatsnew/v1.0.0.rst                |  1 +
 pandas/core/base.py                           | 44 +++++----------
 pandas/core/groupby/generic.py                | 32 +++++-------
 pandas/tests/frame/test_apply.py              |  4 +-
 .../tests/groupby/aggregate/test_aggregate.py | 25 ++++---
 pandas/tests/groupby/aggregate/test_other.py  | 58 ++++++----------
 pandas/tests/groupby/test_groupby.py          | 20 +++---
 pandas/tests/resample/test_resample_api.py    | 66 ++++++-------------
 pandas/tests/series/test_apply.py             | 30 +++------
 pandas/tests/window/test_api.py               | 43 +++---------
 10 files changed, 101 insertions(+), 222 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 011834516eafd..98d861d999ea9 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -279,6 +279,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
 - Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`)
 - Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`)
 - Removed :meth:`Index.is_lexsorted_for_tuple` (:issue:`29305`)
+- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`29608`)
 -
 
 .. _whatsnew_1000.performance:

diff --git a/pandas/core/base.py b/pandas/core/base.py
index e070005c56d7a..c9855701eeb03 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -283,9 +283,7 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs):
             # people may try to aggregate on a non-callable attribute
             # but don't let them think they can pass args to it
             assert len(args) == 0
-            assert (
-                len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0
-            )
+            assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
             return f
 
         f = getattr(np, arg, None)
@@ -324,34 +322,17 @@ def _aggregate(self, arg, *args, **kwargs):
         _axis = kwargs.pop("_axis", None)
         if _axis is None:
             _axis = getattr(self, "axis", 0)
-        _level = kwargs.pop("_level", None)
 
         if isinstance(arg, str):
            return self._try_aggregate_string_function(arg, *args, **kwargs), None
 
        if isinstance(arg, dict):
-
            # aggregate based on the passed dict
            if _axis != 0:  # pragma: no cover
                raise ValueError("Can only pass dict with axis=0")
 
            obj = self._selected_obj
 
-            def nested_renaming_depr(level: int = 4):
-                # deprecation of nested renaming
-                # GH 15931
-                msg = textwrap.dedent(
-                    """\
-                    using a dict with renaming is deprecated and will be removed
-                    in a future version.
-
-                    For column-specific groupby renaming, use named aggregation
-
-                    >>> df.groupby(...).agg(name=('column', aggfunc))
-                    """
-                )
-                warnings.warn(msg, FutureWarning, stacklevel=level)
-
            # if we have a dict of any non-scalars
            # eg. 
{'A' : ['mean']}, normalize all to # be list-likes @@ -374,18 +355,9 @@ def nested_renaming_depr(level: int = 4): # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): - is_nested_renamer = True - - if k not in obj.columns: - msg = ( - "cannot perform renaming for {key} with a " - "nested dictionary" - ).format(key=k) - raise SpecificationError(msg) - nested_renaming_depr(4 + (_level or 0)) - + raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCSeries): - nested_renaming_depr() + raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCDataFrame) and k not in obj.columns: raise KeyError("Column '{col}' does not exist!".format(col=k)) @@ -398,7 +370,7 @@ def nested_renaming_depr(level: int = 4): if isinstance(obj, ABCDataFrame) and len( obj.columns.intersection(keys) ) != len(keys): - nested_renaming_depr() + raise SpecificationError("nested renamer is not supported") from pandas.core.reshape.concat import concat @@ -411,14 +383,14 @@ def _agg_1dim(name, how, subset=None): raise SpecificationError( "nested dictionary is ambiguous in aggregation" ) - return colg.aggregate(how, _level=(_level or 0) + 1) + return colg.aggregate(how) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) - return colg.aggregate(how, _level=None) + return colg.aggregate(how) def _agg(arg, func): """ @@ -535,7 +507,7 @@ def is_any_frame() -> bool: return result, True elif is_list_like(arg): # we require a list, but not an 'str' - return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None + return self._aggregate_multiple_funcs(arg, _axis=_axis), None else: result = None @@ -546,7 +518,7 @@ def is_any_frame() -> bool: # caller can react return result, True - def _aggregate_multiple_funcs(self, arg, _level, _axis): + def _aggregate_multiple_funcs(self, arg, _axis): from pandas.core.reshape.concat import concat if _axis != 0: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 31563e4bccbb7..0ca6ef043fffb 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -11,7 +11,6 @@ from textwrap import dedent import typing from typing import Any, Callable, FrozenSet, Iterable, Sequence, Type, Union, cast -import warnings import numpy as np @@ -213,7 +212,6 @@ def apply(self, func, *args, **kwargs): ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - _level = kwargs.pop("_level", None) relabeling = func is None columns = None @@ -232,7 +230,7 @@ def aggregate(self, func=None, *args, **kwargs): # Catch instances of lists / tuples # but not the class list / tuple itself. 
func = _maybe_mangle_lambdas(func) - ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) + ret = self._aggregate_multiple_funcs(func) if relabeling: ret.columns = columns else: @@ -256,8 +254,7 @@ def aggregate(self, func=None, *args, **kwargs): if not self.as_index: # pragma: no cover print("Warning, ignoring as_index=True") - # _level handled at higher - if not _level and isinstance(ret, dict): + if isinstance(ret, dict): from pandas import concat ret = concat(ret, axis=1) @@ -265,23 +262,14 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _aggregate_multiple_funcs(self, arg, _level): + def _aggregate_multiple_funcs(self, arg): if isinstance(arg, dict): # show the deprecation, but only if we # have not shown a higher level one # GH 15931 - if isinstance(self._selected_obj, Series) and _level <= 1: - msg = dedent( - """\ - using a dict on a Series for aggregation - is deprecated and will be removed in a future version. Use \ - named aggregation instead. - - >>> grouper.agg(name_1=func_1, name_2=func_2) - """ - ) - warnings.warn(msg, FutureWarning, stacklevel=3) + if isinstance(self._selected_obj, Series): + raise SpecificationError("nested renamer is not supported") columns = list(arg.keys()) arg = arg.items() @@ -317,8 +305,7 @@ def _aggregate_multiple_funcs(self, arg, _level): if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle - if _level: - return results + return results return DataFrame(results, columns=columns) @@ -845,7 +832,6 @@ class DataFrameGroupBy(GroupBy): ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - _level = kwargs.pop("_level", None) relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: @@ -858,7 +844,7 @@ def aggregate(self, func=None, *args, **kwargs): func = _maybe_mangle_lambdas(func) - result, how = self._aggregate(func, _level=_level, *args, **kwargs) + result, how = self._aggregate(func, *args, **kwargs) if how is None: return result @@ -878,9 +864,7 @@ def aggregate(self, func=None, *args, **kwargs): # try to treat as if we are passing a list try: - result = self._aggregate_multiple_funcs( - [func], _level=_level, _axis=self.axis - ) + result = self._aggregate_multiple_funcs([func], _axis=self.axis) except ValueError as err: if "no results" not in str(err): # raised directly by _aggregate_multiple_funcs diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index ad53fcf29c57d..3c97a87c95bd2 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -13,6 +13,7 @@ from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, notna from pandas.conftest import _get_cython_table_params from pandas.core.apply import frame_apply +from pandas.core.base import SpecificationError import pandas.util.testing as tm @@ -1094,7 +1095,8 @@ def test_agg_dict_nested_renaming_depr(self): df = pd.DataFrame({"A": range(5), "B": 5}) # nested renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) def test_agg_reduce(self, axis, float_frame): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 113c2c6d6d4ac..ea986058616d7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ 
-267,16 +267,16 @@ def bar(x): return np.std(x, ddof=1) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): d = OrderedDict( [["C", np.mean], ["D", OrderedDict([["foo", np.mean], ["bar", np.std]])]] ) - result = grouped.aggregate(d) + grouped.aggregate(d) + # But without renaming, these functions are OK d = OrderedDict([["C", [np.mean]], ["D", [foo, bar]]]) - expected = grouped.aggregate(d) - - tm.assert_frame_equal(result, expected) + grouped.aggregate(d) def test_multi_function_flexible_mix(df): @@ -288,26 +288,25 @@ def test_multi_function_flexible_mix(df): [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", {"sum": "sum"}]] ) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = grouped.aggregate(d) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) # Test 1 d = OrderedDict( [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", "sum"]] ) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) # Test 2 d = OrderedDict( [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", ["sum"]]] ) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) def test_groupby_agg_coercing_bools(): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 721045f1097f8..f14384928b979 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -211,31 +211,26 @@ def test_aggregate_api_consistency(): expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) - expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) - expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]]) - tm.assert_frame_equal(result, expected, check_like=True) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) def test_agg_dict_renaming_deprecation(): # 15931 df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): df.groupby("A").agg( {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} ) - assert "using a dict with renaming" in str(w[0].message) - assert "named aggregation" in str(w[0].message) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(SpecificationError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) - with tm.assert_produces_warning(FutureWarning) as w: + with pytest.raises(SpecificationError, match=msg): 
df.groupby("A").B.agg({"foo": "count"}) - assert "using a dict on a Series for aggregation" in str(w[0].message) - assert "named aggregation instead." in str(w[0].message) def test_agg_compat(): @@ -251,18 +246,12 @@ def test_agg_compat(): g = df.groupby(["A", "B"]) - expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) - expected.columns = MultiIndex.from_tuples([("C", "sum"), ("C", "std")]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"C": ["sum", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) - - expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) - expected.columns = ["C", "D"] + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": ["sum", "std"]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"C": "sum", "D": "std"}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": "sum", "D": "std"}) def test_agg_nested_dicts(): @@ -278,29 +267,20 @@ def test_agg_nested_dicts(): g = df.groupby(["A", "B"]) - msg = r"cannot perform renaming for r[1-2] with a nested dictionary" + msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) - expected = pd.concat( - [g["C"].mean(), g["C"].std(), g["D"].mean(), g["D"].std()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) # same name as the original column # GH9052 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = g["D"].agg({"result1": np.sum, "result2": np.mean}) - expected = expected.rename(columns={"result1": "D"}) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"result1": np.sum, "result2": np.mean}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"D": np.sum, "result2": np.mean}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"D": np.sum, "result2": np.mean}) def test_agg_item_by_item_raise_typeerror(): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e17181f55fdba..0d68ff36dfa20 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv +from pandas.core.base import SpecificationError import pandas.core.common as com import pandas.util.testing as tm @@ -55,8 +56,9 @@ def test_basic(dtype): # complex agg agged = grouped.aggregate([np.mean, np.std]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - agged = grouped.aggregate({"one": np.mean, "two": np.std}) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate({"one": np.mean, "two": np.std}) group_constants = {0: 10, 1: 20, 2: 30} agged = 
grouped.agg(lambda x: group_constants[x.name] + x.mean()) @@ -452,9 +454,9 @@ def test_frame_set_name_single(df): result = grouped["C"].agg([np.mean, np.std]) assert result.index.name == "A" - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped["C"].agg({"foo": np.mean, "bar": np.std}) - assert result.index.name == "A" + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"foo": np.mean, "bar": np.std}) def test_multi_func(df): @@ -602,12 +604,10 @@ def test_groupby_as_index_agg(df): tm.assert_frame_equal(result2, expected2) grouped = df.groupby("A", as_index=True) - expected3 = grouped["C"].sum() - expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result3 = grouped["C"].agg({"Q": np.sum}) - tm.assert_frame_equal(result3, expected3) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"Q": np.sum}) # multi-key diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index cbdfbb7a3100b..8e1774d8ee5b7 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -247,10 +247,9 @@ def test_agg_consistency(): r = df.resample("3T") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = r[["A", "B", "C"]].agg({"r1": "mean", "r2": "sum"}) - result = r.agg({"r1": "mean", "r2": "sum"}) - tm.assert_frame_equal(result, expected, check_like=True) + msg = "nested renamer is not supported" + with pytest.raises(pd.core.base.SpecificationError, match=msg): + r.agg({"r1": "mean", "r2": "sum"}) # TODO: once GH 14008 is fixed, move these tests into @@ -307,26 +306,23 @@ def test_agg(): result = t["A"].aggregate(["mean", "sum"]) tm.assert_frame_equal(result, expected) - expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) + msg = "nested renamer is not supported" for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] ) for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.aggregate( + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate( { "A": {"mean": "mean", "sum": "sum"}, "B": {"mean2": "mean", "sum2": "sum"}, } ) - tm.assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( @@ -383,12 +379,10 @@ def test_agg_misc(): [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] ) + msg = "nested renamer is not supported" for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t[["A", "B"]].agg( - OrderedDict([("result1", np.sum), ("result2", np.mean)]) - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t[["A", 
"B"]].agg(OrderedDict([("result1", np.sum), ("result2", np.mean)])) # agg with different hows expected = pd.concat( @@ -408,21 +402,11 @@ def test_agg_misc(): # series like aggs for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t["A"].agg({"A": ["sum", "std"]}) - expected = pd.concat([t["A"].sum(), t["A"].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "std")]) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t["A"].agg({"A": ["sum", "std"]}) - expected = pd.concat( - [t["A"].agg(["sum", "std"]), t["A"].agg(["mean", "std"])], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] - ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) # errors # invalid names in the agg specification @@ -451,28 +435,20 @@ def test_agg_nested_dicts(): df.groupby(pd.Grouper(freq="2D")), ] - msg = r"cannot perform renaming for r(1|2) with a nested dictionary" + msg = "nested renamer is not supported" for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): t.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) for t in cases: - expected = pd.concat( - [t["A"].mean(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] - ) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t[["A", "B"]].agg( + + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t[["A", "B"]].agg( {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} ) - tm.assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) def test_try_aggregate_non_existing_column(): diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 09c5247ef616a..bdbfa333ef33a 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -7,6 +7,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, isna from pandas.conftest import _get_cython_table_params +from pandas.core.base import SpecificationError import pandas.util.testing as tm @@ -157,7 +158,8 @@ def test_apply_dict_depr(self): columns=["A", "B", "C"], index=pd.date_range("1/1/2000", periods=10), ) - with tm.assert_produces_warning(FutureWarning): + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): tsdf.A.agg({"foo": ["sum", "mean"]}) @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) @@ -256,31 +258,17 @@ def test_demo(self): tm.assert_series_equal(result, expected) # nested renaming - with tm.assert_produces_warning(FutureWarning): - result = s.agg({"foo": ["min", "max"]}) - - expected = ( - DataFrame({"foo": [0, 5]}, 
index=["min", "max"]).unstack().rename("series") - ) - tm.assert_series_equal(result, expected) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"]}) def test_multiple_aggregators_with_dict_api(self): s = Series(range(6), dtype="int64", name="series") # nested renaming - with tm.assert_produces_warning(FutureWarning): - result = s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) - - expected = ( - DataFrame( - {"foo": [5.0, np.nan, 0.0, np.nan], "bar": [np.nan, 2.5, np.nan, 15.0]}, - columns=["foo", "bar"], - index=["max", "mean", "min", "sum"], - ) - .unstack() - .rename("series") - ) - tm.assert_series_equal(result.reindex_like(expected), expected) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) def test_agg_apply_evaluate_lambdas_the_same(self, string_series): # test that we are evaluating row-by-row first diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 11527efa4c39f..5085576cc96f0 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -1,6 +1,4 @@ from collections import OrderedDict -import warnings -from warnings import catch_warnings import numpy as np import pytest @@ -82,7 +80,6 @@ def test_agg(self): a_sum = r["A"].sum() b_mean = r["B"].mean() b_std = r["B"].std() - b_sum = r["B"].sum() result = r.aggregate([np.mean, np.std]) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) @@ -104,26 +101,18 @@ def test_agg(self): expected.columns = ["mean", "sum"] tm.assert_frame_equal(result, expected) - with catch_warnings(record=True): + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): # using a dict with renaming - warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - expected = concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) - tm.assert_frame_equal(result, expected, check_like=True) + r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate( + with pytest.raises(SpecificationError, match=msg): + r.aggregate( { "A": {"mean": "mean", "sum": "sum"}, "B": {"mean2": "mean", "sum2": "sum"}, } ) - expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1) - exp_cols = [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] - expected.columns = pd.MultiIndex.from_tuples(exp_cols) - tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) @@ -168,7 +157,7 @@ def test_agg_nested_dicts(self): df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) - msg = r"cannot perform renaming for (r1|r2) with a nested dictionary" + msg = "nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) @@ -178,25 +167,13 @@ def test_agg_nested_dicts(self): expected.columns = pd.MultiIndex.from_tuples( [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] ) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r[["A", "B"]].agg( + with pytest.raises(SpecificationError, match=msg): + r[["A", "B"]].agg( {"A": 
{"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} ) - tm.assert_frame_equal(result, expected, check_like=True) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) - expected.columns = pd.MultiIndex.from_tuples( - [ - ("A", "ra", "mean"), - ("A", "ra", "std"), - ("B", "rb", "mean"), - ("B", "rb", "std"), - ] - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) def test_count_nonnumeric_types(self): # GH12541 From ad4c4d513cbfbb7eff0f326897e3cca71c667924 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 19 Nov 2019 15:54:13 -0500 Subject: [PATCH 122/185] Better return description in `droplevel` docstring (#29717) --- pandas/core/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 982a57a6f725e..4f45a96d23941 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -807,7 +807,8 @@ def droplevel(self, level, axis=0): Returns ------- - DataFrame.droplevel() + DataFrame + DataFrame with requested index / column level(s) removed. Examples -------- From 958756af5cb40658e975a70d29089b68aea93040 Mon Sep 17 00:00:00 2001 From: Wenhuan Date: Wed, 20 Nov 2019 13:13:37 +0800 Subject: [PATCH 123/185] BUG: fix replacer's dtypes not respected for frame replace (#26632) (#29317) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/internals/blocks.py | 2 ++ pandas/core/internals/managers.py | 2 +- pandas/tests/frame/test_replace.py | 16 ++++++++++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 98d861d999ea9..bfc5cac7339ca 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -466,6 +466,7 @@ Reshaping - Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) - Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`) - Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) +- Bug in :meth:`DataFrame.replace` that caused non-numeric replacer's dtype not respected (:issue:`26632`) Sparse ^^^^^^ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7ace80415c846..5edb4d93e068a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2825,6 +2825,8 @@ def _replace_coerce( if convert: block = [b.convert(numeric=False, copy=True) for b in block] return block + if convert: + return [self.convert(numeric=False, copy=True)] return self diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0e6ba8a2c2a6a..c36dd9463c61d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -629,7 +629,7 @@ def comp(s, regex=False): convert=convert, regex=regex, ) - if m.any(): + if m.any() or convert: new_rb = _extend_blocks(result, new_rb) else: new_rb.append(b) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 60b601b57e007..434ea6ea7b4f0 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1338,5 +1338,21 @@ def test_replace_commutative(self, df, to_replace, exp): 
expected = pd.DataFrame(exp) result = df.replace(to_replace) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "replacer", + [ + pd.Timestamp("20170827"), + np.int8(1), + np.int16(1), + np.float32(1), + np.float64(1), + ], + ) + def test_replace_replacer_dtype(self, replacer): + # GH26632 + df = pd.DataFrame(["a"]) + result = df.replace({"a": replacer, "b": replacer}) + expected = pd.DataFrame([replacer]) tm.assert_frame_equal(result, expected) From 68af5f6cf9ed2a7a55374920682fb030a80dd923 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 04:24:30 -0800 Subject: [PATCH 124/185] =?UTF-8?q?DEPR:=20enforce=20deprecations=20for=20?= =?UTF-8?q?kwargs=20in=20factorize,=20FrozenNDArray.ser=E2=80=A6=20(#29732?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.0.0.rst | 4 ++++ pandas/core/algorithms.py | 19 +++---------------- pandas/core/frame.py | 5 ----- pandas/core/indexes/datetimelike.py | 7 +++---- pandas/core/indexes/frozen.py | 3 --- pandas/tests/frame/test_combine_concat.py | 7 ------- pandas/tests/indexes/datetimes/test_ops.py | 2 -- .../tests/indexes/period/test_arithmetic.py | 2 -- pandas/tests/indexes/test_frozen.py | 3 +-- pandas/tests/test_algos.py | 2 +- 10 files changed, 12 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index bfc5cac7339ca..af16e225c1500 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -280,6 +280,10 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`) - Removed :meth:`Index.is_lexsorted_for_tuple` (:issue:`29305`) - Removed support for nexted renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`29608`) +- Removed previously deprecated "order" argument from :func:`factorize` (:issue:`19751`) +- Removed previously deprecated "v" argument from :meth:`FrozenNDarray.searchsorted`, use "value" instead (:issue:`22672`) +- Removed previously deprecated "raise_conflict" argument from :meth:`DataFrame.update`, use "errors" instead (:issue:`23585`) +- Removed previously deprecated keyword "n" from :meth:`DatetimeIndex.shift`, :meth:`TimedeltaIndex.shift`, :meth:`PeriodIndex.shift`, use "periods" instead (:issue:`22458`) - .. _whatsnew_1000.performance: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ea75d46048e63..e3f1ae78efcec 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -10,7 +10,7 @@ from pandas._libs import Timestamp, algos, hashtable as htable, lib from pandas._libs.tslib import iNaT -from pandas.util._decorators import Appender, Substitution, deprecate_kwarg +from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, @@ -494,7 +494,7 @@ def _factorize_array( Parameters ---------- - %(values)s%(sort)s%(order)s + %(values)s%(sort)s na_sentinel : int, default -1 Value to mark "not found". %(size_hint)s\ @@ -585,14 +585,6 @@ def _factorize_array( coerced to ndarrays before factorization. """ ), - order=dedent( - """\ - order : None - .. deprecated:: 0.23.0 - - This parameter has no effect and is deprecated. 
- """ - ), sort=dedent( """\ sort : bool, default False @@ -608,13 +600,8 @@ def _factorize_array( ), ) @Appender(_shared_docs["factorize"]) -@deprecate_kwarg(old_arg_name="order", new_arg_name=None) def factorize( - values, - sort: bool = False, - order=None, - na_sentinel: int = -1, - size_hint: Optional[int] = None, + values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None, ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0b76566adf802..5baba0bae1d45 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5528,11 +5528,6 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) - @deprecate_kwarg( - old_arg_name="raise_conflict", - new_arg_name="errors", - mapping={False: "ignore", True: "raise"}, - ) def update( self, other, join="left", overwrite=True, filter_func=None, errors="ignore" ): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b8670b765ca90..df3420ea14e24 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -11,7 +11,7 @@ from pandas._libs.algos import unique_deltas from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_int64, @@ -732,8 +732,7 @@ def astype(self, dtype, copy=True): # _data.astype call above return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) - @deprecate_kwarg(old_arg_name="n", new_arg_name="periods") - def shift(self, periods, freq=None): + def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. @@ -742,7 +741,7 @@ def shift(self, periods, freq=None): Parameters ---------- - periods : int + periods : int, default 1 Number of periods (or increments) to shift by, can be positive or negative. diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 1b33269d404d6..2c9521d23f71a 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -11,8 +11,6 @@ import numpy as np -from pandas.util._decorators import deprecate_kwarg - from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.base import PandasObject @@ -155,7 +153,6 @@ def __repr__(self) -> str: prepr = pprint_thing(self, escape_chars=("\t", "\r", "\n"), quote_strings=True) return f"{type(self).__name__}({prepr}, dtype='{self.dtype}')" - @deprecate_kwarg(old_arg_name="v", new_arg_name="value") def searchsorted(self, value, side="left", sorter=None): """ Find indices to insert `value` so as to maintain order. 
diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 12d06dc517f19..e72de487abb2f 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -369,13 +369,6 @@ def test_update_raise_on_overlap(self): with pytest.raises(ValueError, match="Data overlaps"): df.update(other, errors="raise") - @pytest.mark.parametrize("raise_conflict", [True, False]) - def test_update_deprecation(self, raise_conflict): - df = DataFrame([[1.5, 1, 3.0]]) - other = DataFrame() - with tm.assert_produces_warning(FutureWarning): - df.update(other, raise_conflict=raise_conflict) - def test_update_from_non_df(self): d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} df = DataFrame(d) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 2ec267c66091b..2944767ba4c02 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -549,8 +549,6 @@ def test_shift_periods(self): idx = pd.date_range(start=START, end=END, periods=3) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - tm.assert_index_equal(idx.shift(n=0), idx) def test_pickle_unpickle(self): unpickled = tm.round_trip_pickle(self.rng) diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index 80e4b1fe1e430..f8274a82f1b6f 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -117,5 +117,3 @@ def test_shift_periods(self): idx = period_range(freq="A", start="1/1/2001", end="12/1/2009") tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - tm.assert_index_equal(idx.shift(n=0), idx) diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index 712feb7b8ef61..c7b219b5ee890 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -112,5 +112,4 @@ def test_searchsorted(self): expected = 2 assert self.container.searchsorted(7) == expected - with tm.assert_produces_warning(FutureWarning): - assert self.container.searchsorted(v=7) == expected + assert self.container.searchsorted(value=7) == expected diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index baf78d7188b41..9e89a1b6f0467 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -256,7 +256,7 @@ def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. 
data = np.array([2 ** 63, 1, 2 ** 63], dtype=np.uint64) - with tm.assert_produces_warning(expected_warning=FutureWarning): + with pytest.raises(TypeError, match="got an unexpected keyword"): algos.factorize(data, order=True) with tm.assert_produces_warning(False): algos.factorize(data) From 9cfd1b41c1c5b4a214b798a41eb21fbae2ccf089 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 04:26:24 -0800 Subject: [PATCH 125/185] DEPR: change DTI.to_series keep_tz default to True (#29731) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/indexes/datetimes.py | 54 ++++++++++++------------- pandas/tests/frame/test_alter_axes.py | 24 +++++------ pandas/tests/frame/test_constructors.py | 6 +-- pandas/tests/test_base.py | 2 +- 5 files changed, 43 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index af16e225c1500..9959209bbe426 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -275,6 +275,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) - Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) +- Changed the default value for the "keep_tz" argument in :meth:`DatetimeIndex.to_series` to ``True`` (:issue:`23739`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) - Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`) - Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 4a95f0a2ab7e9..b6891bc7e2b59 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -663,14 +663,14 @@ def _get_time_micros(self): values = self._data._local_timestamps() return fields.get_time_micros(values) - def to_series(self, keep_tz=None, index=None, name=None): + def to_series(self, keep_tz=lib._no_default, index=None, name=None): """ Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index. Parameters ---------- - keep_tz : optional, defaults False + keep_tz : optional, defaults True Return the data keeping the timezone. If keep_tz is True: @@ -686,10 +686,10 @@ def to_series(self, keep_tz=None, index=None, name=None): Series will have a datetime64[ns] dtype. TZ aware objects will have the tz removed. - .. versionchanged:: 0.24 - The default value will change to True in a future release. - You can set ``keep_tz=True`` to already obtain the future - behaviour and silence the warning. + .. versionchanged:: 1.0.0 + The default value is now True. In a future version, + this keyword will be removed entirely. Stop passing the + argument to obtain the future behavior and silence the warning. index : Index, optional Index of resulting Series. If None, defaults to original index. 
@@ -708,27 +708,27 @@ def to_series(self, keep_tz=None, index=None, name=None): if name is None: name = self.name - if keep_tz is None and self.tz is not None: - warnings.warn( - "The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change " - "to True in a future release. You can set " - "'keep_tz=True' to obtain the future behaviour and " - "silence this warning.", - FutureWarning, - stacklevel=2, - ) - keep_tz = False - elif keep_tz is False: - warnings.warn( - "Specifying 'keep_tz=False' is deprecated and this " - "option will be removed in a future release. If " - "you want to remove the timezone information, you " - "can do 'idx.tz_convert(None)' before calling " - "'to_series'.", - FutureWarning, - stacklevel=2, - ) + if keep_tz is not lib._no_default: + if keep_tz: + warnings.warn( + "The 'keep_tz' keyword in DatetimeIndex.to_series " + "is deprecated and will be removed in a future version. " + "You can stop passing 'keep_tz' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + else: + warnings.warn( + "Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", + FutureWarning, + stacklevel=2, + ) + else: + keep_tz = True if keep_tz and self.tz is not None: # preserve the tz & copy diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 21470151dcfbd..6206b333d29e1 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -493,29 +493,29 @@ def test_convert_dti_to_series(self): tm.assert_series_equal(result, expected) # convert to series while keeping the timezone - result = idx.to_series(keep_tz=True, index=[0, 1]) + msg = "stop passing 'keep_tz'" + with tm.assert_produces_warning(FutureWarning) as m: + result = idx.to_series(keep_tz=True, index=[0, 1]) tm.assert_series_equal(result, expected) + assert msg in str(m[0].message) # convert to utc - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning) as m: df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) result = df["B"] comp = Series(DatetimeIndex(expected.values).tz_localize(None), name="B") tm.assert_series_equal(result, comp) - - with tm.assert_produces_warning(FutureWarning) as m: - result = idx.to_series(index=[0, 1]) - tm.assert_series_equal(result, expected.dt.tz_convert(None)) - msg = ( - "The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change to True in a future " - "release." 
- ) + msg = "do 'idx.tz_convert(None)' before calling" assert msg in str(m[0].message) - with tm.assert_produces_warning(FutureWarning): + result = idx.to_series(index=[0, 1]) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(keep_tz=False, index=[0, 1]) tm.assert_series_equal(result, expected.dt.tz_convert(None)) + msg = "do 'idx.tz_convert(None)' before calling" + assert msg in str(m[0].message) # list of datetimes with a tz df["B"] = idx.to_pydatetime() diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index cccce96a874dd..cc2e37c14bdf0 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1795,7 +1795,7 @@ def test_constructor_with_datetimes(self): # preserver an index with a tz on dict construction i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") - expected = DataFrame({"a": i.to_series(keep_tz=True).reset_index(drop=True)}) + expected = DataFrame({"a": i.to_series().reset_index(drop=True)}) df = DataFrame() df["a"] = i tm.assert_frame_equal(df, expected) @@ -1806,9 +1806,7 @@ def test_constructor_with_datetimes(self): # multiples i_no_tz = date_range("1/1/2011", periods=5, freq="10s") df = DataFrame({"a": i, "b": i_no_tz}) - expected = DataFrame( - {"a": i.to_series(keep_tz=True).reset_index(drop=True), "b": i_no_tz} - ) + expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz}) tm.assert_frame_equal(df, expected) def test_constructor_datetimes_with_nulls(self): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index d9bdceb258592..58093ba4d90a5 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -179,7 +179,7 @@ def setup_method(self, method): self.int_series = Series(arr, index=self.int_index, name="a") self.float_series = Series(arr, index=self.float_index, name="a") self.dt_series = Series(arr, index=self.dt_index, name="a") - self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True) + self.dt_tz_series = self.dt_tz_index.to_series() self.period_series = Series(arr, index=self.period_index, name="a") self.string_series = Series(arr, index=self.string_index, name="a") self.unicode_series = Series(arr, index=self.unicode_index, name="a") From 2b0cac7d90ee9797194a55fb186743c00fc6d46a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 20 Nov 2019 06:36:06 -0600 Subject: [PATCH 126/185] TST: Silence lzma output (#29713) This was leaking to stdout when the pytest `-s` flag was used. 
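The fix routes the child interpreter's stderr into a pipe instead of letting it reach the terminal. A minimal sketch of the same pattern, assuming only the standard library (the warning text is illustrative):

import subprocess
import sys

code = "import warnings; warnings.warn('chatty import')"
# check_output already pipes the child's stdout; sending stderr to a pipe
# as well keeps the child's noise out of the test runner's output under -s.
out = subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)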
--- pandas/tests/io/test_compression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index d68b6a1effaa0..9bcdda2039458 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -140,7 +140,7 @@ def test_with_missing_lzma(): import pandas """ ) - subprocess.check_output([sys.executable, "-c", code]) + subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) def test_with_missing_lzma_runtime(): @@ -157,4 +157,4 @@ def test_with_missing_lzma_runtime(): df.to_csv('foo.csv', compression='xz') """ ) - subprocess.check_output([sys.executable, "-c", code]) + subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) From c5a1f9e2c373ced9ef2f02ab64d11eaa7b4248f2 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Wed, 20 Nov 2019 13:46:18 +0100 Subject: [PATCH 127/185] BUG: Series groupby does not include nan counts for all categorical labels (#17605) (#29690) --- doc/source/whatsnew/v1.0.0.rst | 41 ++++++++++++ pandas/core/groupby/generic.py | 6 +- pandas/core/groupby/groupby.py | 22 +++++-- pandas/tests/groupby/test_categorical.py | 79 ++++++++++++++++++++++++ 4 files changed, 141 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9959209bbe426..cf3dfe04f7c74 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -194,6 +194,47 @@ New repr for :class:`pandas.core.arrays.IntervalArray` pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) + +All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) + +- :meth:`SeriesGroupBy.count` +- :meth:`SeriesGroupBy.size` +- :meth:`SeriesGroupBy.nunique` +- :meth:`SeriesGroupBy.nth` + +.. ipython:: python + + df = pd.DataFrame({ + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + }) + df + + +*pandas 0.25.x* + +.. code-block:: ipython + + In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + Out[2]: + cat_1 cat_2 + A A 1 + B 1 + B A 1 + B 1 + Name: value, dtype: int64 + + +*pandas 1.0.0* + +.. ipython:: python + + df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + + .. 
_whatsnew_1000.api.other: Other API changes diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0ca6ef043fffb..d1894faadbcd1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -557,7 +557,8 @@ def nunique(self, dropna: bool = True) -> Series: res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - return Series(res, index=ri, name=self._selection_name) + result = Series(res, index=ri, name=self._selection_name) + return self._reindex_output(result, fill_value=0) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -709,12 +710,13 @@ def count(self) -> Series: minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - return Series( + result = Series( out, index=self.grouper.result_index, name=self._selection_name, dtype="int64", ) + return self._reindex_output(result, fill_value=0) def _apply_to_column_groupbys(self, func): """ return a pass thru """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 99a4942df4f7f..75bb818eacf4b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -39,6 +39,7 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna +from pandas._typing import FrameOrSeries, Scalar from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, try_cast_to_ea @@ -1296,7 +1297,7 @@ def size(self): if isinstance(self.obj, Series): result.name = self.obj.name - return result + return self._reindex_output(result, fill_value=0) @classmethod def _add_numeric_operations(cls): @@ -1740,6 +1741,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra if not self.observed and isinstance(result_index, CategoricalIndex): out = out.reindex(result_index) + out = self._reindex_output(out) return out.sort_index() if self.sort else out # dropna is truthy @@ -2380,7 +2382,9 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] - def _reindex_output(self, output): + def _reindex_output( + self, output: FrameOrSeries, fill_value: Scalar = np.NaN + ) -> FrameOrSeries: """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding @@ -2394,8 +2398,10 @@ def _reindex_output(self, output): Parameters ---------- - output: Series or DataFrame + output : Series or DataFrame Object resulting from grouping and applying an operation. + fill_value : scalar, default np.NaN + Value to use for unobserved categories if self.observed is False. 
Returns ------- @@ -2426,7 +2432,11 @@ def _reindex_output(self, output): ).sortlevel() if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, "copy": False} + d = { + self.obj._get_axis_name(self.axis): index, + "copy": False, + "fill_value": fill_value, + } return output.reindex(**d) # GH 13204 @@ -2448,7 +2458,9 @@ def _reindex_output(self, output): output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex(index, copy=False) + output = output.set_index(self.grouper.result_index).reindex( + index, copy=False, fill_value=fill_value + ) # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 663e03aa1bc81..5f78e4860f1e9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1252,3 +1252,82 @@ def test_get_nonexistent_category(): {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) ) + + +def test_series_groupby_on_2_categoricals_unobserved( + reduction_func: str, observed: bool +): + # GH 17605 + + if reduction_func == "ngroup": + pytest.skip("ngroup is not truly a reduction") + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), + "value": [0.1] * 4, + } + ) + args = {"nth": [0]}.get(reduction_func, []) + + expected_length = 4 if observed else 16 + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] + agg = getattr(series_groupby, reduction_func) + result = agg(*args) + + assert len(result) == expected_length + + +@pytest.mark.parametrize( + "func, zero_or_nan", + [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", np.NaN), + ("var", np.NaN), + ], +) +def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): + # GH 17605 + # Tests whether the unobserved categories in the result contain 0 or NaN + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + } + ) + unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")] + args = {"nth": [0]}.get(func, []) + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] + agg = getattr(series_groupby, func) + result = agg(*args) + + for idx in unobserved: + val = result.loc[idx] + assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + + # If we expect unobserved values to be zero, we also expect the dtype to be int + if zero_or_nan == 0: + assert np.issubdtype(result.dtype, np.integer) From 7e2bbd1a75a10d4078157c0b527cef19f12a0ffc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 04:49:24 -0800 Subject: [PATCH 128/185] TST: add test case for user-defined function taking correct path in groupby transform (#29631) --- pandas/core/groupby/generic.py | 6 +++--- pandas/tests/groupby/test_transform.py | 30 
++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d1894faadbcd1..0408fa6945cec 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1382,7 +1382,7 @@ def _define_paths(self, func, *args, **kwargs): ) return fast_path, slow_path - def _choose_path(self, fast_path, slow_path, group): + def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): path = slow_path res = slow_path(group) @@ -1392,8 +1392,8 @@ def _choose_path(self, fast_path, slow_path, group): except AssertionError: raise except Exception: - # Hard to know ex-ante what exceptions `fast_path` might raise - # TODO: no test cases get here + # GH#29631 For user-defined function, we cant predict what may be + # raised; see test_transform.test_transform_fastpath_raises return path, res # verify fast path does not change columns (and names), otherwise diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index db44a4a57230c..3d9a349d94e10 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1073,3 +1073,33 @@ def test_transform_lambda_with_datetimetz(): name="time", ) tm.assert_series_equal(result, expected) + + +def test_transform_fastpath_raises(): + # GH#29631 case where fastpath defined in groupby.generic _choose_path + # raises, but slow_path does not + + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) + gb = df.groupby("A") + + def func(grp): + # we want a function such that func(frame) fails but func.apply(frame) + # works + if grp.ndim == 2: + # Ensure that fast_path fails + raise NotImplementedError("Don't cross the streams") + return grp * 2 + + # Check that the fastpath raises, see _transform_general + obj = gb._obj_with_exclusions + gen = gb.grouper.get_iterator(obj, axis=gb.axis) + fast_path, slow_path = gb._define_paths(func) + _, group = next(gen) + + with pytest.raises(NotImplementedError, match="Don't cross the streams"): + fast_path(group) + + result = gb.transform(func) + + expected = pd.DataFrame([2, -2, 2, 4], columns=["B"]) + tm.assert_frame_equal(result, expected) From 63fd590a6585199edbab51fcdd6363ff8eeb93a1 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 20 Nov 2019 04:58:14 -0800 Subject: [PATCH 129/185] Assorted io extension cleanups (#29704) --- pandas/_libs/src/parser/io.c | 1 - pandas/_libs/src/parser/tokenizer.c | 51 ++++++++--------------------- pandas/_libs/src/parser/tokenizer.h | 6 ---- 3 files changed, 13 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index aecd4e03664e6..1e3295fcb6fc7 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -9,7 +9,6 @@ The full license is in the LICENSE file, distributed with this software. #include "io.h" -#include #include #include diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 83869a1d9c342..578f72112d02d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -25,19 +25,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include "../headers/portable.h" -static void *safe_realloc(void *buffer, size_t size) { - void *result; - // OSX is weird. 
- // http://stackoverflow.com/questions/9560609/ - // different-realloc-behaviour-in-linux-and-osx - - result = realloc(buffer, size); - TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, - result)) - - return result; -} - void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { // column i, starting at 0 self->words = parser->words; @@ -45,18 +32,6 @@ void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { self->line_start = parser->line_start + start; } -coliter_t *coliter_new(parser_t *self, int i) { - // column i, starting at 0 - coliter_t *iter = (coliter_t *)malloc(sizeof(coliter_t)); - - if (NULL == iter) { - return NULL; - } - - coliter_setup(iter, self, i, 0); - return iter; -} - static void free_if_not_null(void **ptr) { TRACE(("free_if_not_null %p\n", *ptr)) if (*ptr != NULL) { @@ -80,7 +55,7 @@ static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, while ((length + space >= cap) && (newbuffer != NULL)) { cap = cap ? cap << 1 : 2; buffer = newbuffer; - newbuffer = safe_realloc(newbuffer, elsize * cap); + newbuffer = realloc(newbuffer, elsize * cap); } if (newbuffer == NULL) { @@ -321,8 +296,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { ("make_stream_space: cap != self->words_cap, nbytes = %d, " "self->words_cap=%d\n", nbytes, self->words_cap)) - newptr = safe_realloc((void *)self->word_starts, - sizeof(int64_t) * self->words_cap); + newptr = realloc((void *)self->word_starts, + sizeof(int64_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -349,8 +324,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { if (cap != self->lines_cap) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) - newptr = safe_realloc((void *)self->line_fields, - sizeof(int64_t) * self->lines_cap); + newptr = realloc((void *)self->line_fields, + sizeof(int64_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -427,7 +402,7 @@ static void append_warning(parser_t *self, const char *msg) { snprintf(self->warn_msg, length + 1, "%s", msg); } else { ex_length = strlen(self->warn_msg); - newptr = safe_realloc(self->warn_msg, ex_length + length + 1); + newptr = realloc(self->warn_msg, ex_length + length + 1); if (newptr != NULL) { self->warn_msg = (char *)newptr; snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); @@ -1290,13 +1265,13 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); + newptr = realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->words = (char **)newptr; } - newptr = safe_realloc((void *)self->word_starts, + newptr = realloc((void *)self->word_starts, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -1315,13 +1290,13 @@ int parser_trim_buffers(parser_t *self) { if (new_cap < self->stream_cap) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " - "safe_realloc\n")); - newptr = safe_realloc((void *)self->stream, new_cap); + "realloc\n")); + newptr = realloc((void *)self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { // Update the pointers in the self->words array (char **) if - // `safe_realloc` + // `realloc` // moved the 
`self->stream` buffer. This block mirrors a similar // block in // `make_stream_space`. @@ -1342,14 +1317,14 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = safe_realloc((void *)self->line_start, + newptr = realloc((void *)self->line_start, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_start = (int64_t *)newptr; } - newptr = safe_realloc((void *)self->line_fields, + newptr = realloc((void *)self->line_fields, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 4903e936dc348..b37de47662feb 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -15,7 +15,6 @@ See LICENSE for the license #define PY_SSIZE_T_CLEAN #include -#define ERROR_OK 0 #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 @@ -32,10 +31,6 @@ See LICENSE for the license #define CALLING_READ_FAILED 2 -#if defined(_MSC_VER) -#define strtoll _strtoi64 -#endif // _MSC_VER - /* C flat file parsing low level code for pandas / NumPy @@ -180,7 +175,6 @@ typedef struct coliter_t { } coliter_t; void coliter_setup(coliter_t *self, parser_t *parser, int i, int start); -coliter_t *coliter_new(parser_t *self, int i); #define COLITER_NEXT(iter, word) \ do { \ From bbc717309a3383a4f22c642012e28bcd36c8d995 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 20 Nov 2019 05:00:45 -0800 Subject: [PATCH 130/185] Fixed Inconsistent GroupBy Output Shape with Duplicate Column Labels (#29124) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/base.py | 4 + pandas/core/groupby/generic.py | 175 +++++++++++++++++++++------ pandas/core/groupby/groupby.py | 79 ++++++++---- pandas/core/groupby/ops.py | 11 +- pandas/tests/groupby/conftest.py | 14 ++- pandas/tests/groupby/test_groupby.py | 36 ++++++ 7 files changed, 255 insertions(+), 65 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cf3dfe04f7c74..f158c1158b54e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -499,6 +499,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) +- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index fed387cbeade4..407cd8342d486 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -3,8 +3,12 @@ hold the whitelist of methods that are exposed on the SeriesGroupBy and the DataFrameGroupBy objects. 
""" +import collections + from pandas.core.dtypes.common import is_list_like, is_scalar +OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) + class GroupByMixin: """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0408fa6945cec..7d3bf3d3dcd2f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -10,7 +10,17 @@ from functools import partial from textwrap import dedent import typing -from typing import Any, Callable, FrozenSet, Iterable, Sequence, Type, Union, cast +from typing import ( + Any, + Callable, + FrozenSet, + Iterable, + Mapping, + Sequence, + Type, + Union, + cast, +) import numpy as np @@ -309,28 +319,91 @@ def _aggregate_multiple_funcs(self, arg): return DataFrame(results, columns=columns) - def _wrap_series_output(self, output, index, names=None): - """ common agg/transform wrapping logic """ - output = output[self._selection_name] + def _wrap_series_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, + ) -> Union[Series, DataFrame]: + """ + Wraps the output of a SeriesGroupBy operation into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + index : pd.Index + Index to apply to the output. - if names is not None: - return DataFrame(output, index=index, columns=names) + Returns + ------- + Series or DataFrame + + Notes + ----- + In the vast majority of cases output and columns will only contain one + element. The exception is operations that expand dimensions, like ohlc. + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result: Union[Series, DataFrame] + if len(output) > 1: + result = DataFrame(indexed_output, index=index) + result.columns = columns else: - name = self._selection_name - if name is None: - name = self._selected_obj.name - return Series(output, index=index, name=name) + result = Series(indexed_output[0], index=index, name=columns[0]) + + return result + + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> Union[Series, DataFrame]: + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. - def _wrap_aggregated_output(self, output, names=None): + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + + Returns + ------- + Series or DataFrame + + Notes + ----- + In the vast majority of cases output will only contain one element. + The exception is operations that expand dimensions, like ohlc. + """ result = self._wrap_series_output( - output=output, index=self.grouper.result_index, names=names + output=output, index=self.grouper.result_index ) return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None): - return self._wrap_series_output( - output=output, index=self.obj.index, names=names - ) + def _wrap_transformed_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> Series: + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : dict[base.OutputKey, Union[Series, np.ndarray]] + Dict with a sole key of 0 and a value of the result values. + + Returns + ------- + Series + + Notes + ----- + output should always contain one element. 
It is specified as a dict + for consistency with DataFrame methods and _wrap_aggregated_output. + """ + assert len(output) == 1 + result = self._wrap_series_output(output=output, index=self.obj.index) + + # No transformations increase the ndim of the result + assert isinstance(result, Series) + return result def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: @@ -1084,17 +1157,6 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return DataFrame(result, columns=result_columns) - def _decide_output_index(self, output, labels): - if len(output) == len(labels): - output_keys = labels - else: - output_keys = sorted(output) - - if isinstance(labels, MultiIndex): - output_keys = MultiIndex.from_tuples(output_keys, names=labels.names) - - return output_keys - def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return DataFrame(index=keys) @@ -1561,27 +1623,62 @@ def _insert_inaxis_grouper_inplace(self, result): if in_axis: result.insert(0, name, lev) - def _wrap_aggregated_output(self, output, names=None): - agg_axis = 0 if self.axis == 1 else 1 - agg_labels = self._obj_with_exclusions._get_axis(agg_axis) + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy aggregations into the expected result. - output_keys = self._decide_output_index(output, agg_labels) + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + + Returns + ------- + DataFrame + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result = DataFrame(indexed_output) + result.columns = columns if not self.as_index: - result = DataFrame(output, columns=output_keys) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - result = DataFrame(output, index=index, columns=output_keys) + result.index = index if self.axis == 1: result = result.T return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None) -> DataFrame: - return DataFrame(output, index=self.obj.index) + def _wrap_transformed_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy transformations into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. 
+ + Returns + ------- + DataFrame + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result = DataFrame(indexed_output) + result.columns = columns + result.index = self.obj.index + + return result def _wrap_agged_blocks(self, items, blocks): if not self.as_index: @@ -1701,9 +1798,11 @@ def groupby_series(obj, col=None): if isinstance(obj, Series): results = groupby_series(obj) else: + # TODO: this is duplicative of how GroupBy naturally works + # Try to consolidate with normal wrapping functions from pandas.core.reshape.concat import concat - results = [groupby_series(obj[col], col) for col in obj.columns] + results = [groupby_series(content, label) for label, content in obj.items()] results = concat(results, axis=1) results.columns.names = obj.columns.names @@ -1745,7 +1844,7 @@ def _normalize_keyword_aggregation(kwargs): """ Normalize user-provided "named aggregation" kwargs. - Transforms from the new ``Dict[str, NamedAgg]`` style kwargs + Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs to the old OrderedDict[str, List[scalar]]]. Parameters @@ -1766,7 +1865,7 @@ def _normalize_keyword_aggregation(kwargs): >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) """ - # Normalize the aggregation functions as Dict[column, List[func]], + # Normalize the aggregation functions as Mapping[column, List[func]], # process normally, then fixup the names. # TODO(Py35): When we drop python 3.5, change this to # defaultdict(list) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 75bb818eacf4b..f7282950498c5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -7,14 +7,23 @@ class providing the base-class of operations. expose these user-facing objects to provide specific functionailty. """ -import collections from contextlib import contextmanager import datetime from functools import partial, wraps import inspect import re import types -from typing import FrozenSet, Iterable, List, Optional, Tuple, Type, Union +from typing import ( + Dict, + FrozenSet, + Iterable, + List, + Mapping, + Optional, + Tuple, + Type, + Union, +) import numpy as np @@ -42,7 +51,7 @@ class providing the base-class of operations. 
from pandas._typing import FrameOrSeries, Scalar from pandas.core import nanops import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical, try_cast_to_ea +from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -820,31 +829,33 @@ def _transform_should_cast(self, func_nm: str) -> bool: ) def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): - output = collections.OrderedDict() # type: dict - for obj in self._iterate_slices(): + output: Dict[base.OutputKey, np.ndarray] = {} + for idx, obj in enumerate(self._iterate_slices()): name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue try: - result, names = self.grouper.transform(obj.values, how, **kwargs) + result, _ = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue + if self._transform_should_cast(how): - output[name] = self._try_cast(result, obj) - else: - output[name] = result + result = self._try_cast(result, obj) + + key = base.OutputKey(label=name, position=idx) + output[key] = result if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_transformed_output(output, names) + return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output, names=None): + def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_transformed_output(self, output, names=None): + def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): @@ -853,30 +864,48 @@ def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output = {} + output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {} + # Ideally we would be able to enumerate self._iterate_slices and use + # the index from enumeration as the key of output, but ohlc in particular + # returns a (n x 4) array. Output requires 1D ndarrays as values, so we + # need to slice that up into 1D arrays + idx = 0 for obj in self._iterate_slices(): name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue - result, names = self.grouper.aggregate( + result, agg_names = self.grouper.aggregate( obj._values, how, min_count=min_count ) - output[name] = self._try_cast(result, obj) + + if agg_names: + # e.g. 
ohlc + assert len(agg_names) == result.shape[1] + for result_column, result_name in zip(result.T, agg_names): + key = base.OutputKey(label=result_name, position=idx) + output[key] = self._try_cast(result_column, obj) + idx += 1 + else: + assert result.ndim == 1 + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj) + idx += 1 if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output, names) + return self._wrap_aggregated_output(output) def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output = {} - for obj in self._iterate_slices(): + output: Dict[base.OutputKey, np.ndarray] = {} + + for idx, obj in enumerate(self._iterate_slices()): name = obj.name if self.grouper.ngroups == 0: # agg_series below assumes ngroups > 0 @@ -896,7 +925,8 @@ def _python_agg_general(self, func, *args, **kwargs): result, counts = self.grouper.agg_series(obj, f) assert result is not None - output[name] = self._try_cast(result, obj, numeric_only=True) + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) @@ -904,14 +934,14 @@ def _python_agg_general(self, func, *args, **kwargs): if self.grouper._filter_empty_groups: mask = counts.ravel() > 0 - for name, result in output.items(): + for key, result in output.items(): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[name] = self._try_cast(values[mask], result) + output[key] = self._try_cast(values[mask], result) return self._wrap_aggregated_output(output) @@ -2223,10 +2253,10 @@ def _get_cythonized_result( grouper = self.grouper labels, _, ngroups = grouper.group_info - output = collections.OrderedDict() # type: dict + output: Dict[base.OutputKey, np.ndarray] = {} base_func = getattr(libgroupby, how) - for obj in self._iterate_slices(): + for idx, obj in enumerate(self._iterate_slices()): name = obj.name values = obj._data._values @@ -2260,7 +2290,8 @@ def _get_cythonized_result( if post_processing: result = post_processing(result, inferences) - output[name] = result + key = base.OutputKey(label=name, position=idx) + output[key] = result if aggregate: return self._wrap_aggregated_output(output) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 47ca2b2190ecf..7fd9fb8f53134 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -424,8 +424,15 @@ def _get_cython_func_and_vals( return func, values def _cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs - ): + self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs + ) -> Tuple[np.ndarray, Optional[List[str]]]: + """ + Returns the values of a cython operation as a Tuple of [data, names]. + + Names is only useful when dealing with 2D results, like ohlc + (see self._name_functions). 
+ """ + assert kind in ["transform", "aggregate"] orig_values = values diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index af98f9efe2af9..5b8cc86513954 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex -from pandas.core.groupby.base import reduction_kernels +from pandas.core.groupby.base import reduction_kernels, transformation_kernels import pandas.util.testing as tm @@ -110,3 +110,15 @@ def reduction_func(request): """yields the string names of all groupby reduction functions, one at a time. """ return request.param + + +@pytest.fixture(params=transformation_kernels) +def transformation_func(request): + """yields the string names of all groupby transformation functions.""" + return request.param + + +@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels)) +def groupby_func(request): + """yields both aggregation and transformation functions.""" + return request.param diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0d68ff36dfa20..b848e9caad9be 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1951,3 +1951,39 @@ def test_groupby_only_none_group(): expected = pd.Series([np.nan], name="x") tm.assert_series_equal(actual, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # 21668 + df = pd.DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + + expected = df + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "idx", [pd.Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))] +) +def test_dup_labels_output_shape(groupby_func, idx): + if groupby_func in {"size", "ngroup", "cumcount"}: + pytest.skip("Not applicable") + + df = pd.DataFrame([[1, 1]], columns=idx) + grp_by = df.groupby([0]) + + args = [] + if groupby_func in {"fillna", "nth"}: + args.append(0) + elif groupby_func == "corrwith": + args.append(df) + elif groupby_func == "tshift": + df.index = [pd.Timestamp("today")] + args.extend([1, "D"]) + + result = getattr(grp_by, groupby_func)(*args) + + assert result.shape == (1, 2) + tm.assert_index_equal(result.columns, idx) From 002a89cefada8894adf9717e83bde663ad1a54aa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 05:02:06 -0800 Subject: [PATCH 131/185] CLN: reshape (#29627) --- pandas/core/reshape/concat.py | 37 +++++--------------------- pandas/core/reshape/melt.py | 18 ++++++++----- pandas/core/reshape/merge.py | 32 +++++++++++----------- pandas/core/reshape/pivot.py | 50 ++++++++++++++++++++++++----------- pandas/core/reshape/tile.py | 6 ++--- 5 files changed, 72 insertions(+), 71 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 3e8d19096a36e..c2322ae626cfd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -2,6 +2,7 @@ concat routines """ +from typing import List import warnings import numpy as np @@ -437,13 +438,13 @@ def get_result(self): mgr = self.objs[0]._data.concat( [x._data for x in self.objs], self.new_axes ) - cons = _get_series_result_type(mgr, self.objs) + cons = self.objs[0]._constructor return cons(mgr, name=name).__finalize__(self, method="concat") # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) - cons 
= _get_series_result_type(data) + cons = DataFrame index, columns = self.new_axes df = cons(data, index=index) @@ -473,7 +474,7 @@ def get_result(self): if not self.copy: new_data._consolidate_inplace() - cons = _get_frame_result_type(new_data, self.objs) + cons = self.objs[0]._constructor return cons._from_axes(new_data, self.new_axes).__finalize__( self, method="concat" ) @@ -520,13 +521,13 @@ def _get_new_axes(self): new_axes[self.axis] = self._get_concat_axis() return new_axes - def _get_comb_axis(self, i): + def _get_comb_axis(self, i: int) -> Index: data_axis = self.objs[0]._get_block_manager_axis(i) return get_objs_combined_axis( self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort ) - def _get_concat_axis(self): + def _get_concat_axis(self) -> Index: """ Return index to be used along concatenation axis. """ @@ -537,7 +538,7 @@ def _get_concat_axis(self): idx = ibase.default_index(len(self.objs)) return idx elif self.keys is None: - names = [None] * len(self.objs) + names: List = [None] * len(self.objs) num = 0 has_names = False for i, x in enumerate(self.objs): @@ -702,27 +703,3 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde return MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) - - -def _get_series_result_type(result, objs=None): - """ - return appropriate class of Series concat - input is either dict or array-like - """ - # TODO: See if we can just inline with _constructor_expanddim - # now that sparse is removed. - - # concat Series with axis 1 - if isinstance(result, dict): - return DataFrame - - # otherwise it is a SingleBlockManager (axis = 0) - return objs[0]._constructor - - -def _get_frame_result_type(result, objs): - """ - return appropriate class of DataFrame-like concat - """ - # TODO: just inline this as _constructor. - return objs[0] diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 16c04454898db..4cba52c5cd651 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -1,4 +1,5 @@ import re +from typing import List import numpy as np @@ -10,7 +11,7 @@ from pandas.core.dtypes.missing import notna from pandas.core.arrays import Categorical -from pandas.core.frame import _shared_docs +from pandas.core.frame import DataFrame, _shared_docs from pandas.core.indexes.base import Index from pandas.core.reshape.concat import concat from pandas.core.tools.numeric import to_numeric @@ -21,13 +22,13 @@ % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") ) def melt( - frame, + frame: DataFrame, id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, -): +) -> DataFrame: # TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` @@ -35,6 +36,7 @@ def melt( cols = [x for c in frame.columns for x in c] else: cols = list(frame.columns) + if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] @@ -119,7 +121,7 @@ def melt( return frame._constructor(mdata, columns=mcolumns) -def lreshape(data, groups, dropna=True, label=None): +def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame: """ Reshape long-format data to wide. Generalized inverse of DataFrame.pivot @@ -129,6 +131,8 @@ def lreshape(data, groups, dropna=True, label=None): groups : dict {new_name : list_of_columns} dropna : boolean, default True + label : object, default None + Dummy kwarg, not used. 
Examples -------- @@ -188,7 +192,7 @@ def lreshape(data, groups, dropna=True, label=None): return data._constructor(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): +def wide_to_long(df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): r""" Wide panel to long format. Less flexible but more user-friendly than melt. @@ -412,14 +416,14 @@ def wide_to_long(df, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): two 2.9 """ - def get_var_names(df, stub, sep, suffix): + def get_var_names(df, stub: str, sep: str, suffix: str) -> List[str]: regex = r"^{stub}{sep}{suffix}$".format( stub=re.escape(stub), sep=re.escape(sep), suffix=suffix ) pattern = re.compile(regex) return [col for col in df.columns if pattern.match(col)] - def melt_stub(df, stub, i, j, value_vars, sep: str): + def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( df, id_vars=i, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 76c4b328eb4db..4d838db6c95f6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -92,7 +92,7 @@ def merge( def _groupby_and_merge( - by, on, left, right, _merge_pieces, check_duplicates: bool = True + by, on, left, right: "DataFrame", _merge_pieces, check_duplicates: bool = True ): """ groupby & merge; we are always performing a left-by type operation @@ -313,7 +313,7 @@ def merge_asof( suffixes=("_x", "_y"), tolerance=None, allow_exact_matches: bool = True, - direction="backward", + direction: str = "backward", ): """ Perform an asof merge. This is similar to a left-join except that we @@ -1299,11 +1299,13 @@ def _get_join_indexers( right_keys ), "left_key and right_keys must be the same length" - # bind `sort` arg. of _factorize_keys - fkeys = partial(_factorize_keys, sort=sort) - # get left & right join labels and num. of levels at each location - llab, rlab, shape = map(list, zip(*map(fkeys, left_keys, right_keys))) + mapped = ( + _factorize_keys(left_keys[n], right_keys[n], sort=sort) + for n in range(len(left_keys)) + ) + zipped = zip(*mapped) + llab, rlab, shape = [list(x) for x in zipped] # get flat i8 keys from label lists lkey, rkey = _get_join_keys(llab, rlab, shape, sort) @@ -1311,7 +1313,7 @@ def _get_join_indexers( # factorize keys to a dense i8 space # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = fkeys(lkey, rkey) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) @@ -1487,12 +1489,12 @@ def get_result(self): return result -def _asof_function(direction): +def _asof_function(direction: str): name = "asof_join_{dir}".format(dir=direction) return getattr(libjoin, name, None) -def _asof_by_function(direction): +def _asof_by_function(direction: str): name = "asof_join_{dir}_on_X_by_Y".format(dir=direction) return getattr(libjoin, name, None) @@ -1536,7 +1538,7 @@ def __init__( how: str = "asof", tolerance=None, allow_exact_matches: bool = True, - direction="backward", + direction: str = "backward", ): self.by = by @@ -1775,11 +1777,11 @@ def flip(xs): def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): - # bind `sort` argument - fkeys = partial(_factorize_keys, sort=sort) - # left & right join labels and num. 
of levels at each location - mapped = (fkeys(index.levels[n], join_keys[n]) for n in range(len(index.levels))) + mapped = ( + _factorize_keys(index.levels[n], join_keys[n], sort=sort) + for n in range(index.nlevels) + ) zipped = zip(*mapped) rcodes, lcodes, shape = [list(x) for x in zipped] if sort: @@ -1804,7 +1806,7 @@ def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort) # factorize keys to a dense i8 space - lkey, rkey, count = fkeys(lkey, rkey) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) return libjoin.left_outer_join(lkey, rkey, count, sort=sort) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0626420d9c114..b126b6e221ccc 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING, Callable, Dict, Tuple, Union + import numpy as np from pandas.util._decorators import Appender, Substitution @@ -14,6 +16,9 @@ from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series +if TYPE_CHECKING: + from pandas import DataFrame + # Note: We need to make sure `frame` is imported before `pivot`, otherwise # _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency @@ -180,14 +185,14 @@ def pivot_table( def _add_margins( - table, + table: Union["Series", "DataFrame"], data, values, rows, cols, aggfunc, observed=None, - margins_name="All", + margins_name: str = "All", fill_value=None, ): if not isinstance(margins_name, str): @@ -200,14 +205,16 @@ def _add_margins( grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) - # could be passed a Series object with no 'columns' - if hasattr(table, "columns"): + if table.ndim == 2: + # i.e. DataFramae for level in table.columns.names[1:]: if margins_name in table.columns.get_level_values(level): raise ValueError(msg) if len(rows) > 1: - key = (margins_name,) + ("",) * (len(rows) - 1) + key = (margins_name,) + ("",) * ( + len(rows) - 1 + ) # type: Union[str, Tuple[str, ...]] else: key = margins_name @@ -216,7 +223,7 @@ def _add_margins( # one column in the data. Compute grand margin and return it. 
return table.append(Series({key: grand_margin[margins_name]})) - if values: + elif values: marginal_result_set = _generate_marginal_results( table, data, @@ -232,12 +239,15 @@ def _add_margins( return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: + # no values, and table is a DataFrame + assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( table, data, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set + row_margin = row_margin.reindex(result.columns, fill_value=fill_value) # populate grand margin for k in margin_keys: @@ -266,7 +276,7 @@ def _add_margins( return result -def _compute_grand_margin(data, values, aggfunc, margins_name="All"): +def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): if values: grand_margin = {} @@ -289,7 +299,15 @@ def _compute_grand_margin(data, values, aggfunc, margins_name="All"): def _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, grand_margin, margins_name="All" + table, + data, + values, + rows, + cols, + aggfunc, + observed, + grand_margin, + margins_name: str = "All", ): if len(cols) > 0: # need to "interleave" the margins @@ -353,7 +371,7 @@ def _all_key(key): def _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name="All" + table: "DataFrame", data, rows, cols, aggfunc, observed, margins_name: str = "All" ): if len(cols) > 0: # need to "interleave" the margins @@ -406,7 +424,7 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) -def pivot(data, index=None, columns=None, values=None): +def pivot(data: "DataFrame", index=None, columns=None, values=None): if values is None: cols = [columns] if index is None else [index, columns] append = index is None @@ -436,8 +454,8 @@ def crosstab( colnames=None, aggfunc=None, margins=False, - margins_name="All", - dropna=True, + margins_name: str = "All", + dropna: bool = True, normalize=False, ): """ @@ -546,7 +564,7 @@ def crosstab( if pass_objs: common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) - data = {} + data = {} # type: dict data.update(zip(rownames, index)) data.update(zip(colnames, columns)) @@ -585,7 +603,7 @@ def crosstab( return table -def _normalize(table, normalize, margins, margins_name="All"): +def _normalize(table, normalize, margins: bool, margins_name="All"): if not isinstance(normalize, (bool, str)): axis_subs = {0: "index", 1: "columns"} @@ -601,7 +619,7 @@ def _normalize(table, normalize, margins, margins_name="All"): "all": lambda x: x / x.sum(axis=1).sum(axis=0), "columns": lambda x: x / x.sum(), "index": lambda x: x.div(x.sum(axis=1), axis=0), - } + } # type: Dict[Union[bool, str], Callable] normalizers[True] = normalizers["all"] @@ -668,7 +686,7 @@ def _normalize(table, normalize, margins, margins_name="All"): return table -def _get_names(arrs, names, prefix="row"): +def _get_names(arrs, names, prefix: str = "row"): if names is None: names = [] for i, arr in enumerate(arrs): diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 073bb4707f890..bfaa49dd576dc 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -496,7 +496,7 @@ def _convert_bin_to_datelike_type(bins, dtype): def _format_labels( - bins, precision, right: bool = True, include_lowest: bool = False, 
dtype=None + bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None ): """ based on the dtype, return our labels """ @@ -565,7 +565,7 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original): return fac, bins -def _round_frac(x, precision): +def _round_frac(x, precision: int): """ Round the fractional part of the given number """ @@ -580,7 +580,7 @@ def _round_frac(x, precision): return np.around(x, digits) -def _infer_precision(base_precision, bins): +def _infer_precision(base_precision: int, bins) -> int: """Infer an appropriate precision for _round_frac """ for precision in range(base_precision, 20): From 6bfd03ed62960d25f22782cd37d22b9531890440 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 05:12:12 -0800 Subject: [PATCH 132/185] TYP: more annotations for io.pytables (#29703) --- pandas/io/pytables.py | 131 ++++++++++++++++++++++++++++++------------ 1 file changed, 94 insertions(+), 37 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 193b8f5053d65..9589832095474 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -520,16 +520,16 @@ def root(self): def filename(self): return self._path - def __getitem__(self, key): + def __getitem__(self, key: str): return self.get(key) - def __setitem__(self, key, value): + def __setitem__(self, key: str, value): self.put(key, value) - def __delitem__(self, key): + def __delitem__(self, key: str): return self.remove(key) - def __getattr__(self, name): + def __getattr__(self, name: str): """ allow attribute access to get stores """ try: return self.get(name) @@ -791,7 +791,12 @@ def func(_start, _stop, _where): return it.get_result() def select_as_coordinates( - self, key: str, where=None, start=None, stop=None, **kwargs + self, + key: str, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + **kwargs, ): """ return the selection as an Index @@ -943,13 +948,13 @@ def func(_start, _stop, _where): return it.get_result(coordinates=True) - def put(self, key, value, format=None, append=False, **kwargs): + def put(self, key: str, value, format=None, append=False, **kwargs): """ Store object in HDFStore. Parameters ---------- - key : object + key : str value : {Series, DataFrame} format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format @@ -1028,7 +1033,14 @@ def remove(self, key: str, where=None, start=None, stop=None): return s.delete(where=where, start=start, stop=stop) def append( - self, key, value, format=None, append=True, columns=None, dropna=None, **kwargs + self, + key: str, + value, + format=None, + append=True, + columns=None, + dropna=None, + **kwargs, ): """ Append to Table in file. 
Node must already exist and be Table @@ -1036,7 +1048,7 @@ def append( Parameters ---------- - key : object + key : str value : {Series, DataFrame} format : 'table' is the default table(t) : table format @@ -1077,7 +1089,14 @@ def append( self._write_to_group(key, value, append=append, dropna=dropna, **kwargs) def append_to_multiple( - self, d, value, selector, data_columns=None, axes=None, dropna=False, **kwargs + self, + d: Dict, + value, + selector, + data_columns=None, + axes=None, + dropna=False, + **kwargs, ): """ Append to multiple tables @@ -1123,7 +1142,7 @@ def append_to_multiple( # figure out how to split the value remain_key = None - remain_values = [] + remain_values: List = [] for k, v in d.items(): if v is None: if remain_key is not None: @@ -1871,7 +1890,7 @@ def validate(self, handler, append): def validate_names(self): pass - def validate_and_set(self, handler, append): + def validate_and_set(self, handler: "AppendableTable", append: bool): self.set_table(handler.table) self.validate_col() self.validate_attr(append) @@ -1901,7 +1920,7 @@ def validate_col(self, itemsize=None): return None - def validate_attr(self, append): + def validate_attr(self, append: bool): # check for backwards incompatibility if append: existing_kind = getattr(self.attrs, self.kind_attr, None) @@ -1967,7 +1986,7 @@ def read_metadata(self, handler): """ retrieve the metadata for this columns """ self.metadata = handler.read_metadata(self.cname) - def validate_metadata(self, handler): + def validate_metadata(self, handler: "AppendableTable"): """ validate that kind=category does not change the categories """ if self.meta == "category": new_metadata = self.metadata @@ -1982,7 +2001,7 @@ def validate_metadata(self, handler): "different categories to the existing" ) - def write_metadata(self, handler): + def write_metadata(self, handler: "AppendableTable"): """ set the meta data """ if self.metadata is not None: handler.write_metadata(self.cname, self.metadata) @@ -1995,7 +2014,15 @@ class GenericIndexCol(IndexCol): def is_indexed(self) -> bool: return False - def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): + def convert( + self, + values, + nan_rep, + encoding, + errors, + start: Optional[int] = None, + stop: Optional[int] = None, + ): """ set the values from this selection: take = take ownership Parameters @@ -2012,9 +2039,9 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): the underlying table's row count are normalized to that. 
""" - start = start if start is not None else 0 - stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows - self.values = Int64Index(np.arange(stop - start)) + _start = start if start is not None else 0 + _stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows + self.values = Int64Index(np.arange(_stop - _start)) return self @@ -2749,7 +2776,9 @@ def get_attrs(self): def write(self, obj, **kwargs): self.set_attrs() - def read_array(self, key: str, start=None, stop=None): + def read_array( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ): """ read an array for the specified node (off of group """ import tables @@ -2836,7 +2865,7 @@ def write_block_index(self, key, index): self.write_array("{key}_blengths".format(key=key), index.blengths) setattr(self.attrs, "{key}_length".format(key=key), index.length) - def read_block_index(self, key, **kwargs): + def read_block_index(self, key, **kwargs) -> BlockIndex: length = getattr(self.attrs, "{key}_length".format(key=key)) blocs = self.read_array("{key}_blocs".format(key=key), **kwargs) blengths = self.read_array("{key}_blengths".format(key=key), **kwargs) @@ -2846,7 +2875,7 @@ def write_sparse_intindex(self, key, index): self.write_array("{key}_indices".format(key=key), index.indices) setattr(self.attrs, "{key}_length".format(key=key), index.length) - def read_sparse_intindex(self, key, **kwargs): + def read_sparse_intindex(self, key, **kwargs) -> IntIndex: length = getattr(self.attrs, "{key}_length".format(key=key)) indices = self.read_array("{key}_indices".format(key=key), **kwargs) return IntIndex(length, indices) @@ -2878,7 +2907,7 @@ def write_multi_index(self, key, index): label_key = "{key}_label{idx}".format(key=key, idx=i) self.write_array(label_key, level_codes) - def read_multi_index(self, key, **kwargs): + def read_multi_index(self, key, **kwargs) -> MultiIndex: nlevels = getattr(self.attrs, "{key}_nlevels".format(key=key)) levels = [] @@ -2898,7 +2927,9 @@ def read_multi_index(self, key, **kwargs): levels=levels, codes=codes, names=names, verify_integrity=True ) - def read_index_node(self, node, start=None, stop=None): + def read_index_node( + self, node, start: Optional[int] = None, stop: Optional[int] = None + ): data = node[start:stop] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. 
@@ -2953,7 +2984,7 @@ def read_index_node(self, node, start=None, stop=None): return name, index - def write_array_empty(self, key, value): + def write_array_empty(self, key: str, value): """ write a 0-len array """ # ugly hack for length 0 axes @@ -2966,7 +2997,7 @@ def _is_empty_array(self, shape) -> bool: """Returns true if any axis is zero length.""" return any(x == 0 for x in shape) - def write_array(self, key, value, items=None): + def write_array(self, key: str, value, items=None): if key in self.group: self._handle.remove_node(self.group, key) @@ -3052,7 +3083,9 @@ def write_array(self, key, value, items=None): class LegacyFixed(GenericFixed): - def read_index_legacy(self, key, start=None, stop=None): + def read_index_legacy( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ): node = getattr(self.group, key) data = node[start:stop] kind = node._v_attrs.kind @@ -3237,7 +3270,7 @@ def __init__(self, *args, **kwargs): self.selection = None @property - def table_type_short(self): + def table_type_short(self) -> str: return self.table_type.split("_")[0] @property @@ -3311,7 +3344,7 @@ def validate(self, other): ) @property - def is_multi_index(self): + def is_multi_index(self) -> bool: """the levels attribute is 1 or a list in the case of a multi-index""" return isinstance(self.levels, list) @@ -3335,7 +3368,7 @@ def validate_multiindex(self, obj): ) @property - def nrows_expected(self): + def nrows_expected(self) -> int: """ based on our axes, compute the expected nrows """ return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @@ -3691,7 +3724,7 @@ def create_axes( self, axes, obj, - validate=True, + validate: bool = True, nan_rep=None, data_columns=None, min_itemsize=None, @@ -4000,7 +4033,13 @@ def create_description( return d - def read_coordinates(self, where=None, start=None, stop=None, **kwargs): + def read_coordinates( + self, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + **kwargs, + ): """select coordinates (row numbers) from a table; return the coordinates object """ @@ -4013,7 +4052,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) + self.selection = Selection(self, where=where, start=start, stop=stop) coords = self.selection.select_coords() if self.selection.filter is not None: for field, op, filt in self.selection.filter.format(): @@ -4024,7 +4063,13 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return Index(coords) - def read_column(self, column: str, where=None, start=None, stop=None): + def read_column( + self, + column: str, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): """return a single column from the table, generally only indexables are interesting """ @@ -4302,7 +4347,13 @@ def write_data_chunk(self, rows, indexes, mask, values): "tables cannot write this data -> {detail}".format(detail=detail) ) - def delete(self, where=None, start=None, stop=None, **kwargs): + def delete( + self, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + **kwargs, + ): # delete all rows (and return the nrows) if where is None or not len(where): @@ -4323,7 +4374,7 @@ def delete(self, where=None, start=None, stop=None, **kwargs): # create the selection table = self.table - self.selection = Selection(self, where, start=start, stop=stop, **kwargs) + self.selection = Selection(self, where, 
start=start, stop=stop) values = self.selection.select_coords() # delete the rows in reverse order @@ -4913,7 +4964,13 @@ class Selection: """ - def __init__(self, table, where=None, start=None, stop=None): + def __init__( + self, + table: Table, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): self.table = table self.where = where self.start = start From 6b3ba986c89d1701b184d93bd9d8c75b8d386164 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 20 Nov 2019 08:09:50 -0600 Subject: [PATCH 133/185] xfail clipboard for now (#29736) * xfail clipboard for now * bool --- pandas/tests/io/test_clipboard.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 4559ba264d8b7..666dfd245acaa 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -258,6 +258,7 @@ def test_round_trip_valid_encodings(self, enc, df): @pytest.mark.clipboard @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) +@pytest.mark.xfail(reason="flaky in CI", strict=False) def test_raw_roundtrip(data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) From 7aa5d03b3047444926b6441de2b179c7c0b3e821 Mon Sep 17 00:00:00 2001 From: Brian Wignall Date: Wed, 20 Nov 2019 11:31:33 -0500 Subject: [PATCH 134/185] DOC: fix typos (#29739) --- pandas/_libs/hashing.pyx | 2 +- pandas/_libs/hashtable_func_helper.pxi.in | 4 ++-- pandas/_libs/window.pyx | 2 +- pandas/core/algorithms.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/series.py | 2 +- pandas/io/formats/style.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index d3b5ecfdaa178..1906193622953 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -75,7 +75,7 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): lens[i] = l cdata = data - # keep the references alive thru the end of the + # keep the references alive through the end of the # function datas.append(data) vecs[i] = cdata diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index c4284ae403e5c..f8f3858b803a5 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -144,13 +144,13 @@ def duplicated_{{dtype}}({{c_type}}[:] values, object keep='first'): if keep == 'last': {{if dtype == 'object'}} for i in range(n - 1, -1, -1): - # equivalent: range(n)[::-1], which cython doesnt like in nogil + # equivalent: range(n)[::-1], which cython doesn't like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{else}} with nogil: for i in range(n - 1, -1, -1): - # equivalent: range(n)[::-1], which cython doesnt like in nogil + # equivalent: range(n)[::-1], which cython doesn't like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{endif}} diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 86b06397123b7..d6bad0f20d760 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1914,7 +1914,7 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights, values: float64_t[:] values to roll window over weights: float64_t[:] - array of weights whose lenght is window size + array of weights whose length is window size minp: int64_t minimum number of observations to calculate 
variance of a window diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e3f1ae78efcec..9c14102529b48 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -109,7 +109,7 @@ def _ensure_data(values, dtype=None): except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype - # and it is incompat this will fall thru to here + # and it is incompat this will fall through to here return ensure_object(values), "object" # datetimelike diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e52bc17fcc319..8e66db4c61032 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1486,7 +1486,7 @@ def mean(self, skipna=True): values = self if not len(values): - # short-circut for empty max / min + # short-circuit for empty max / min return NaT result = nanops.nanmean(values.view("i8"), skipna=skipna) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3f69dd53491c1..14056c99bd686 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2087,7 +2087,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): will be NA. *args, **kwargs Additional arguments and keywords have no effect but might be - accepted for compatability with NumPy. + accepted for compatibility with NumPy. Returns ------- diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index dce0afd8670b2..6fc4e21d33d16 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1450,7 +1450,7 @@ def _get_level_lengths(index, hidden_elements=None): Optional argument is a list of index positions which should not be visible. - Result is a dictionary of (level, inital_position): span + Result is a dictionary of (level, initial_position): span """ sentinel = object() levels = index.format(sparsify=sentinel, adjoin=False, names=False) From 054a42dd6c817e27d65d6df8a42cd41fbfb09fa5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 08:33:47 -0800 Subject: [PATCH 135/185] DEPR: remove tsplot (#29726) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/plotting/__init__.py | 1 - pandas/plotting/_misc.py | 28 ----------- pandas/tests/plotting/test_datetimelike.py | 57 ---------------------- pandas/tseries/plotting.py | 3 -- 5 files changed, 1 insertion(+), 89 deletions(-) delete mode 100644 pandas/tseries/plotting.py diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f158c1158b54e..318fe62a73ed2 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -318,6 +318,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. 
- Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - Changed the default value for the "keep_tz" argument in :meth:`DatetimeIndex.to_series` to ``True`` (:issue:`23739`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) +- Removed previously deprecated :func:`pandas.tseries.plotting.tsplot` (:issue:`18627`) - Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`) - Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`) - Removed :meth:`Index.is_lexsorted_for_tuple` (:issue:`29305`) diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index ebe047c58b889..55c861e384d67 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -38,7 +38,6 @@ - hist_series and hist_frame (for `Series.hist` and `DataFrame.hist`) - boxplot (`pandas.plotting.boxplot(df)` equivalent to `DataFrame.boxplot`) - boxplot_frame and boxplot_frame_groupby -- tsplot (deprecated) - register and deregister (register converters for the tick formats) - Plots not called as `Series` and `DataFrame` methods: - table diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 6c8bcdada5957..b8f5a0d83b5c1 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -1,5 +1,4 @@ from contextlib import contextmanager -import warnings from pandas.util._decorators import deprecate_kwarg @@ -426,33 +425,6 @@ def autocorrelation_plot(series, ax=None, **kwargs): return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs) -def tsplot(series, plotf, ax=None, **kwargs): - """ - Plots a Series on the given Matplotlib axes or the current axes - - Parameters - ---------- - axes : Axes - series : Series - - Notes - _____ - Supports same kwargs as Axes.plot - - - .. deprecated:: 0.23.0 - Use Series.plot() instead - """ - warnings.warn( - "'tsplot' is deprecated and will be removed in a " - "future version. Please use Series.plot() instead.", - FutureWarning, - stacklevel=2, - ) - plot_backend = _get_plot_backend("matplotlib") - return plot_backend.tsplot(series=series, plotf=plotf, ax=ax, **kwargs) - - class _Options(dict): """ Stores pandas plotting options. 
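The removed helper's own deprecation message already names the replacement: ``Series.plot()``. A minimal sketch of migrating a ``tsplot`` call, using only public API (the series built here is illustrative):

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# any datetime-indexed Series will do for illustration
ts = pd.Series(np.random.randn(100), index=pd.date_range("2000-01-01", periods=100))
_, ax = plt.subplots()
# before: from pandas.tseries.plotting import tsplot; tsplot(ts, plt.Axes.plot, ax=ax)
ts.plot(ax=ax)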
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 973bda8292b2a..f5161b481ca50 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -99,33 +99,12 @@ def test_nonnumeric_exclude(self): with pytest.raises(TypeError, match=msg): df["A"].plot() - def test_tsplot_deprecated(self): - from pandas.tseries.plotting import tsplot - - _, ax = self.plt.subplots() - ts = tm.makeTimeSeries() - - with tm.assert_produces_warning(FutureWarning): - tsplot(ts, self.plt.Axes.plot, ax=ax) - @pytest.mark.slow def test_tsplot(self): - from pandas.tseries.plotting import tsplot - _, ax = self.plt.subplots() ts = tm.makeTimeSeries() - def f(*args, **kwds): - with tm.assert_produces_warning(FutureWarning): - return tsplot(s, self.plt.Axes.plot, *args, **kwds) - - for s in self.period_ser: - _check_plot_works(f, s.index.freq, ax=ax, series=s) - - for s in self.datetime_ser: - _check_plot_works(f, s.index.freq.rule_code, ax=ax, series=s) - for s in self.period_ser: _check_plot_works(s.plot, ax=ax) @@ -194,17 +173,6 @@ def check_format_of_first_point(ax, expected_string): check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") tm.close() - # tsplot - from pandas.tseries.plotting import tsplot - - _, ax = self.plt.subplots() - with tm.assert_produces_warning(FutureWarning): - tsplot(annual, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, "t = 2014 y = 1.000000") - with tm.assert_produces_warning(FutureWarning): - tsplot(daily, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") - @pytest.mark.slow def test_line_plot_period_series(self): for s in self.period_ser: @@ -892,16 +860,6 @@ def test_to_weekly_resampling(self): for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - _, ax = self.plt.subplots() - from pandas.tseries.plotting import tsplot - - with tm.assert_produces_warning(FutureWarning): - tsplot(high, self.plt.Axes.plot, ax=ax) - with tm.assert_produces_warning(FutureWarning): - lines = tsplot(low, self.plt.Axes.plot, ax=ax) - for l in lines: - assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - @pytest.mark.slow def test_from_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") @@ -926,21 +884,6 @@ def test_from_weekly_resampling(self): tm.assert_numpy_array_equal(xdata, expected_h) tm.close() - _, ax = self.plt.subplots() - from pandas.tseries.plotting import tsplot - - with tm.assert_produces_warning(FutureWarning): - tsplot(low, self.plt.Axes.plot, ax=ax) - with tm.assert_produces_warning(FutureWarning): - lines = tsplot(high, self.plt.Axes.plot, ax=ax) - for l in lines: - assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - xdata = l.get_xdata(orig=False) - if len(xdata) == 12: # idxl lines - tm.assert_numpy_array_equal(xdata, expected_l) - else: - tm.assert_numpy_array_equal(xdata, expected_h) - @pytest.mark.slow def test_from_resampling_area_line_mixed(self): idxh = date_range("1/1/1999", periods=52, freq="W") diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py deleted file mode 100644 index df41b4b5b40d9..0000000000000 --- a/pandas/tseries/plotting.py +++ /dev/null @@ -1,3 +0,0 @@ -# flake8: noqa - -from pandas.plotting._matplotlib.timeseries import tsplot From 70b65520c0f47760394a2d261df6628fe98b0cd9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 08:43:02 -0800 Subject: [PATCH 136/185] DEPR: remove deprecated keywords in 
read_excel, to_records (#29721) --- doc/source/whatsnew/v1.0.0.rst | 2 ++ pandas/core/frame.py | 30 +++++---------------------- pandas/io/excel/_base.py | 9 +------- pandas/tests/frame/test_convert_to.py | 13 ------------ pandas/tests/io/excel/test_readers.py | 8 ------- 5 files changed, 8 insertions(+), 54 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 318fe62a73ed2..a3d17b2b32353 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -315,6 +315,8 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) +- :func:`read_excel` removed support for "skip_footer" argument, use "skipfooter" instead (:issue:`18836`) +- :meth:`DataFrame.to_records` no longer supports the argument "convert_datetime64" (:issue:`18902`) - Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - Changed the default value for the "keep_tz" argument in :meth:`DatetimeIndex.to_series` to ``True`` (:issue:`23739`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5baba0bae1d45..cb74fff51e7a3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -66,7 +66,6 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, - is_datetime64_any_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -1685,9 +1684,7 @@ def from_records( return cls(mgr) - def to_records( - self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None - ): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """ Convert DataFrame to a NumPy record array. @@ -1699,11 +1696,6 @@ def to_records( index : bool, default True Include index in resulting record array, stored in 'index' field or using the index label, if set. - convert_datetime64 : bool, default None - .. deprecated:: 0.23.0 - - Whether to convert the index to datetime.datetime if it is a - DatetimeIndex. column_dtypes : str, type, dict, default None .. versionadded:: 0.24.0 @@ -1778,24 +1770,12 @@ def to_records( dtype=[('I', 'S1'), ('A', ' Date: Wed, 20 Nov 2019 09:07:18 -0800 Subject: [PATCH 137/185] DEPR: remove nthreads kwarg from read_feather (#29728) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/feather_format.py | 7 ------- pandas/tests/io/test_feather.py | 17 ----------------- 3 files changed, 1 insertion(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a3d17b2b32353..c3ff0e83938fa 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -323,6 +323,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. 
- Removed previously deprecated :func:`pandas.tseries.plotting.tsplot` (:issue:`18627`) - Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`) - Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`) +- Removed previously deprecated "nthreads" argument from :func:`read_feather`, use "use_threads" instead (:issue:`23053`) - Removed :meth:`Index.is_lexsorted_for_tuple` (:issue:`29305`) - Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`29608`) - Removed previously deprecated "order" argument from :func:`factorize` (:issue:`19751`) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index d9e88f42c2ef2..dffe04fb63720 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -3,7 +3,6 @@ from distutils.version import LooseVersion from pandas.compat._optional import import_optional_dependency -from pandas.util._decorators import deprecate_kwarg from pandas import DataFrame, Int64Index, RangeIndex @@ -66,7 +65,6 @@ def to_feather(df: DataFrame, path): feather.write_feather(df, path) -@deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads") def read_feather(path, columns=None, use_threads=True): """ Load a feather-format object from the file path. @@ -89,11 +87,6 @@ def read_feather(path, columns=None, use_threads=True): If not provided, all columns are read. .. versionadded:: 0.24.0 - nthreads : int, default 1 - Number of CPU threads to use when reading to pandas.DataFrame. - - .. versionadded:: 0.21.0 - .. deprecated:: 0.24.0 use_threads : bool, default True Whether to parallelize reading using multiple threads. diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 0f68a6534dad1..e06f2c31a2870 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -107,23 +107,6 @@ def test_unsupported_other(self): # Some versions raise ValueError, others raise ArrowInvalid.
self.check_error_on_write(df, Exception) - def test_rw_nthreads(self): - df = pd.DataFrame({"A": np.arange(100000)}) - expected_warning = ( - "the 'nthreads' keyword is deprecated, use 'use_threads' instead" - ) - # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: - self.check_round_trip(df, nthreads=2) - # we have an extra FutureWarning because of #GH23752 - assert any(expected_warning in str(x) for x in w) - - # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: - self.check_round_trip(df, nthreads=1) - # we have an extra FutureWarnings because of #GH23752 - assert any(expected_warning in str(x) for x in w) - def test_rw_use_threads(self): df = pd.DataFrame({"A": np.arange(100000)}) self.check_round_trip(df, use_threads=True) From 71334472766fc95e7dc828dce2bfe798f6bb19dc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 09:11:18 -0800 Subject: [PATCH 138/185] DOC: fix _validate_names docstring (#29729) --- pandas/io/parsers.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2cb4a5c8bb2f6..ff3583b79d79c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -395,25 +395,22 @@ def _validate_integer(name, val, min_val=0): def _validate_names(names): """ - Check if the `names` parameter contains duplicates. - - If duplicates are found, we issue a warning before returning. + Raise ValueError if the `names` parameter contains duplicates. Parameters ---------- names : array-like or None An array containing a list of the names used for the output DataFrame. - Returns - ------- - names : array-like or None - The original `names` parameter. + Raises + ------ + ValueError + If names are not unique. 
""" if names is not None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") - return names def _read(filepath_or_buffer: FilePathOrBuffer, kwds): From 9e7a73310b206d89b9096331bcf419a733dde018 Mon Sep 17 00:00:00 2001 From: ganevgv Date: Wed, 20 Nov 2019 17:14:28 +0000 Subject: [PATCH 139/185] TST: add test for indexing with single/double tuples (#29448) --- pandas/tests/frame/indexing/test_indexing.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 24a431fe42cf8..9a7cd4ace686f 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -2624,6 +2624,17 @@ def test_index_namedtuple(self): result = df.loc[IndexType("foo", "bar")]["A"] assert result == 1 + @pytest.mark.parametrize("tpl", [tuple([1]), tuple([1, 2])]) + def test_index_single_double_tuples(self, tpl): + # GH 20991 + idx = pd.Index([tuple([1]), tuple([1, 2])], name="A", tupleize_cols=False) + df = DataFrame(index=idx) + + result = df.loc[[tpl]] + idx = pd.Index([tpl], name="A", tupleize_cols=False) + expected = DataFrame(index=idx) + tm.assert_frame_equal(result, expected) + def test_boolean_indexing(self): idx = list(range(3)) cols = ["A", "B", "C"] From 84fcbb883b5b2716b981fffc4a3a97fd60b0ae5d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 09:15:09 -0800 Subject: [PATCH 140/185] DEPR: remove Series.from_array, DataFrame.from_items, as_matrix, asobject, as_blocks, blocks (#29720) --- doc/source/reference/frame.rst | 1 - doc/source/whatsnew/v1.0.0.rst | 6 ++ pandas/core/frame.py | 107 +-------------------- pandas/core/generic.py | 85 ---------------- pandas/core/series.py | 56 +---------- pandas/tests/frame/test_api.py | 8 -- pandas/tests/frame/test_block_internals.py | 10 +- pandas/tests/frame/test_constructors.py | 87 ----------------- pandas/tests/series/test_api.py | 5 - pandas/tests/series/test_dtypes.py | 6 -- pandas/tests/series/test_timeseries.py | 4 - 11 files changed, 13 insertions(+), 362 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 4b5faed0f4d2d..37d27093efefd 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -351,7 +351,6 @@ Serialization / IO / conversion :toctree: api/ DataFrame.from_dict - DataFrame.from_items DataFrame.from_records DataFrame.info DataFrame.to_parquet diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c3ff0e83938fa..7ac1830af568c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -312,6 +312,12 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`) - Changed the the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. 
It now defaults to False (:issue:`27600`) +- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`18529`) +- Removed :meth:`Series.from_array` (:issue:`18258`) +- Removed :meth:`DataFrame.from_items` (:issue:`18458`) +- Removed :meth:`DataFrame.as_matrix`, :meth:`Series.as_matrix` (:issue:`18458`) +- Removed :meth:`Series.asobject` (:issue:`18477`) +- Removed :meth:`DataFrame.as_blocks`, :meth:`Series.as_blocks`, `DataFrame.blocks`, :meth:`Series.blocks` (:issue:`17656`) - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cb74fff51e7a3..dbf9f1b2082a3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -76,7 +76,6 @@ is_iterator, is_list_like, is_named_tuple, - is_nested_list_like, is_object_dtype, is_scalar, is_sequence, @@ -342,8 +341,9 @@ class DataFrame(NDFrame): -------- DataFrame.from_records : Constructor from tuples, also record arrays. DataFrame.from_dict : From dicts of Series, arrays, or dicts. - DataFrame.from_items : From sequence of (key, value) pairs - read_csv, pandas.read_table, pandas.read_clipboard. + read_csv + read_table + read_clipboard Examples -------- @@ -387,9 +387,7 @@ def _constructor(self) -> Type["DataFrame"]: return DataFrame _constructor_sliced = Series # type: Type[Series] - _deprecations = NDFrame._deprecations | frozenset( - ["from_items"] - ) # type: FrozenSet[str] + _deprecations = NDFrame._deprecations | frozenset([]) # type: FrozenSet[str] _accessors = set() # type: Set[str] @property @@ -1850,103 +1848,6 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) - @classmethod - def from_items(cls, items, columns=None, orient="columns"): - """ - Construct a DataFrame from a list of tuples. - - .. deprecated:: 0.23.0 - `from_items` is deprecated and will be removed in a future version. - Use :meth:`DataFrame.from_dict(dict(items)) ` - instead. - :meth:`DataFrame.from_dict(OrderedDict(items)) ` - may be used to preserve the key order. - - Convert (key, value) pairs to DataFrame. The keys will be the axis - index (usually the columns, but depends on the specified - orientation). The values should be arrays or Series. - - Parameters - ---------- - items : sequence of (key, value) pairs - Values should be arrays or Series. - columns : sequence of column labels, optional - Must be passed if orient='index'. - orient : {'columns', 'index'}, default 'columns' - The "orientation" of the data. If the keys of the - input correspond to column labels, pass 'columns' - (default). Otherwise if the keys correspond to the index, - pass 'index'. - - Returns - ------- - DataFrame - """ - - warnings.warn( - "from_items is deprecated. Please use " - "DataFrame.from_dict(dict(items), ...) instead. 
" - "DataFrame.from_dict(OrderedDict(items)) may be used to " - "preserve the key order.", - FutureWarning, - stacklevel=2, - ) - - keys, values = zip(*items) - - if orient == "columns": - if columns is not None: - columns = ensure_index(columns) - - idict = dict(items) - if len(idict) < len(items): - if not columns.equals(ensure_index(keys)): - raise ValueError( - "With non-unique item names, passed " - "columns must be identical" - ) - arrays = values - else: - arrays = [idict[k] for k in columns if k in idict] - else: - columns = ensure_index(keys) - arrays = values - - # GH 17312 - # Provide more informative error msg when scalar values passed - try: - return cls._from_arrays(arrays, columns, None) - - except ValueError: - if not is_nested_list_like(values): - raise ValueError( - "The value in each (key, value) pair " - "must be an array, Series, or dict" - ) - - elif orient == "index": - if columns is None: - raise TypeError("Must pass columns with orient='index'") - - keys = ensure_index(keys) - - # GH 17312 - # Provide more informative error msg when scalar values passed - try: - arr = np.array(values, dtype=object).T - data = [lib.maybe_convert_objects(v) for v in arr] - return cls._from_arrays(data, columns, keys) - - except TypeError: - if not is_nested_list_like(values): - raise ValueError( - "The value in each (key, value) pair " - "must be an array, Series, or dict" - ) - - else: # pragma: no cover - raise ValueError("'orient' must be either 'columns' or 'index'") - @classmethod def _from_arrays(cls, arrays, columns, index, dtype=None): mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4f45a96d23941..6fbe95fa973cb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -173,9 +173,6 @@ class NDFrame(PandasObject, SelectionMixin): _accessors = set() # type: Set[str] _deprecations = frozenset( [ - "as_blocks", - "as_matrix", - "blocks", "clip_lower", "clip_upper", "get_dtype_counts", @@ -5409,54 +5406,6 @@ def _get_bool_data(self): # ---------------------------------------------------------------------- # Internal Interface Methods - def as_matrix(self, columns=None): - """ - Convert the frame to its Numpy-array representation. - - .. deprecated:: 0.23.0 - Use :meth:`DataFrame.values` instead. - - Parameters - ---------- - columns : list, optional, default:None - If None, return all columns, otherwise, returns specified columns. - - Returns - ------- - values : ndarray - If the caller is heterogeneous and contains booleans or objects, - the result will be of dtype=object. See Notes. - - See Also - -------- - DataFrame.values - - Notes - ----- - Return is NOT a Numpy-matrix, rather, a Numpy-array. - - The dtype will be a lower-common-denominator dtype (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. Use this - with care if you are not dealing with the blocks. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. By numpy.find_common_type convention, mixing int64 and uint64 - will result in a float64 dtype. - - This method is provided for backwards compatibility. Generally, - it is recommended to use '.values'. - """ - warnings.warn( - "Method .as_matrix will be removed in a future version. 
" - "Use .values instead.", - FutureWarning, - stacklevel=2, - ) - self._consolidate_inplace() - return self._data.as_array(transpose=self._AXIS_REVERSED, items=columns) - @property def values(self): """ @@ -5774,40 +5723,6 @@ def ftypes(self): return Series(self._data.get_ftypes(), index=self._info_axis, dtype=np.object_) - def as_blocks(self, copy=True): - """ - Convert the frame to a dict of dtype -> Constructor Types. - - .. deprecated:: 0.21.0 - - NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in - as_matrix) - - Parameters - ---------- - copy : bool, default True - - Returns - ------- - dict - Mapping dtype -> Constructor Types. - """ - warnings.warn( - "as_blocks is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - return self._to_dict_of_blocks(copy=copy) - - @property - def blocks(self): - """ - Internal property, property synonym for as_blocks(). - - .. deprecated:: 0.21.0 - """ - return self.as_blocks() - def _to_dict_of_blocks(self, copy=True): """ Return a dict of dtype -> Constructor Types that diff --git a/pandas/core/series.py b/pandas/core/series.py index 14056c99bd686..c10871d04ef3e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -176,17 +176,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): base.IndexOpsMixin._deprecations | generic.NDFrame._deprecations | frozenset( - [ - "asobject", - "compress", - "valid", - "ftype", - "real", - "imag", - "put", - "ptp", - "nonzero", - ] + ["compress", "valid", "ftype", "real", "imag", "put", "ptp", "nonzero"] ) ) @@ -364,32 +354,6 @@ def _init_dict(self, data, index=None, dtype=None): s = s.reindex(index, copy=False) return s._data, s.index - @classmethod - def from_array( - cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False - ): - """ - Construct Series from array. - - .. deprecated:: 0.23.0 - Use pd.Series(..) constructor instead. - - Returns - ------- - Series - Constructed Series. - """ - warnings.warn( - "'from_array' is deprecated and will be removed in a " - "future version. Please use the pd.Series(..) " - "constructor instead.", - FutureWarning, - stacklevel=2, - ) - return cls( - arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath - ) - # ---------------------------------------------------------------------- @property @@ -579,24 +543,6 @@ def get_values(self): def _internal_get_values(self): return self._data.get_values() - @property - def asobject(self): - """ - Return object Series which contains boxed values. - - .. deprecated:: 0.23.0 - - Use ``astype(object)`` instead. - - *this is an internal non-public method* - """ - warnings.warn( - "'asobject' is deprecated. 
Use 'astype(object)' instead", - FutureWarning, - stacklevel=2, - ) - return self.astype(object).values - # ops def ravel(self, order="C"): """ diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 50b1dec21c549..a86e1dfe8353c 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -476,14 +476,6 @@ def test_values(self, float_frame): float_frame.values[:, 0] = 5.0 assert (float_frame.values[:, 0] == 5).all() - def test_as_matrix_deprecated(self, float_frame): - # GH 18458 - with tm.assert_produces_warning(FutureWarning): - cols = float_frame.columns.tolist() - result = float_frame.as_matrix(columns=cols) - expected = float_frame.values - tm.assert_numpy_array_equal(result, expected) - def test_deepcopy(self, float_frame): cp = deepcopy(float_frame) series = cp["A"] diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index b45c074f179a0..d491e9f25c897 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -313,10 +313,7 @@ def test_copy_blocks(self, float_frame): column = df.columns[0] # use the default copy=True, change a column - - # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - blocks = df.as_blocks() + blocks = df._to_dict_of_blocks(copy=True) for dtype, _df in blocks.items(): if column in _df: _df.loc[:, column] = _df[column] + 1 @@ -330,10 +327,7 @@ def test_no_copy_blocks(self, float_frame): column = df.columns[0] # use the copy=False, change a column - - # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - blocks = df.as_blocks(copy=False) + blocks = df._to_dict_of_blocks(copy=False) for dtype, _df in blocks.items(): if column in _df: _df.loc[:, column] = _df[column] + 1 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index cc2e37c14bdf0..ce0ebdbe56354 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -10,7 +10,6 @@ from pandas.compat import is_platform_little_endian -from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -1508,92 +1507,6 @@ def test_constructor_manager_resize(self, float_frame): tm.assert_index_equal(result.index, Index(index)) tm.assert_index_equal(result.columns, Index(columns)) - def test_constructor_from_items(self, float_frame, float_string_frame): - items = [(c, float_frame[c]) for c in float_frame.columns] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items(items) - tm.assert_frame_equal(recons, float_frame) - - # pass some columns - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items(items, columns=["C", "B", "A"]) - tm.assert_frame_equal(recons, float_frame.loc[:, ["C", "B", "A"]]) - - # orient='index' - - row_items = [ - (idx, float_string_frame.xs(idx)) for idx in float_string_frame.index - ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items( - row_items, columns=float_string_frame.columns, orient="index" - ) - tm.assert_frame_equal(recons, float_string_frame) - assert recons["A"].dtype == np.float64 - - msg = "Must pass columns with orient='index'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, 
check_stacklevel=False): - DataFrame.from_items(row_items, orient="index") - - # orient='index', but thar be tuples - arr = construct_1d_object_array_from_listlike( - [("bar", "baz")] * len(float_string_frame) - ) - float_string_frame["foo"] = arr - row_items = [ - (idx, list(float_string_frame.xs(idx))) for idx in float_string_frame.index - ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items( - row_items, columns=float_string_frame.columns, orient="index" - ) - tm.assert_frame_equal(recons, float_string_frame) - assert isinstance(recons["foo"][0], tuple) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - rs = DataFrame.from_items( - [("A", [1, 2, 3]), ("B", [4, 5, 6])], - orient="index", - columns=["one", "two", "three"], - ) - xp = DataFrame( - [[1, 2, 3], [4, 5, 6]], index=["A", "B"], columns=["one", "two", "three"] - ) - tm.assert_frame_equal(rs, xp) - - def test_constructor_from_items_scalars(self): - # GH 17312 - msg = ( - r"The value in each \(key, value\) " - "pair must be an array, Series, or dict" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items([("A", 1), ("B", 4)]) - - msg = ( - r"The value in each \(key, value\) " - "pair must be an array, Series, or dict" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items( - [("A", 1), ("B", 2)], columns=["col1"], orient="index" - ) - - def test_from_items_deprecation(self): - # GH 17320 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items([("A", [1, 2, 3]), ("B", [4, 5, 6])]) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items( - [("A", [1, 2, 3]), ("B", [4, 5, 6])], - columns=["col1", "col2", "col3"], - orient="index", - ) - def test_constructor_mix_series_nonseries(self, float_frame): df = DataFrame( {"A": float_frame["A"], "B": list(float_frame["B"])}, columns=["A", "B"] diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 00c66c8a13bd9..1e4757ffecb5d 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -199,11 +199,6 @@ def test_constructor_dict_timedelta_index(self): ) self._assert_series_equal(result, expected) - def test_from_array_deprecated(self): - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - self.series_klass.from_array([1, 2, 3]) - def test_sparse_accessor_updates_on_inplace(self): s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]") s.drop([0, 1], inplace=True) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 4b03115c11cb3..e1ace952f722d 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -44,12 +44,6 @@ def test_astype(self, dtype): assert as_typed.dtype == dtype assert as_typed.name == s.name - def test_asobject_deprecated(self): - s = Series(np.random.randn(5), name="foo") - with tm.assert_produces_warning(FutureWarning): - o = s.asobject - assert isinstance(o, np.ndarray) - def test_dtype(self, datetime_series): assert datetime_series.dtype == np.dtype("float64") diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index cf06a9a7c8415..1587ae5eb7d07 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -1040,10 
+1040,6 @@ def test_from_M8_structured(self): assert isinstance(s[0], Timestamp) assert s[0] == dates[0][0] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = Series.from_array(arr["Date"], Index([0])) - assert s[0] == dates[0][0] - def test_get_level_values_box(self): from pandas import MultiIndex From eddd9f09a76628e1842cc19634b3f9b9f3b0fe83 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 09:15:59 -0800 Subject: [PATCH 141/185] REF: ensure name and cname are always str (#29692) --- pandas/io/pytables.py | 108 ++++++++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 36 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9589832095474..4c9e10e0f4601 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1710,29 +1710,37 @@ class IndexCol: is_data_indexable = True _info_fields = ["freq", "tz", "index_name"] + name: str + cname: str + kind_attr: str + def __init__( self, + name: str, values=None, kind=None, typ=None, - cname=None, + cname: Optional[str] = None, itemsize=None, - name=None, axis=None, - kind_attr=None, + kind_attr: Optional[str] = None, pos=None, freq=None, tz=None, index_name=None, **kwargs, ): + + if not isinstance(name, str): + raise ValueError("`name` must be a str.") + self.values = values self.kind = kind self.typ = typ self.itemsize = itemsize self.name = name - self.cname = cname - self.kind_attr = kind_attr + self.cname = cname or name + self.kind_attr = kind_attr or f"{name}_kind" self.axis = axis self.pos = pos self.freq = freq @@ -1742,19 +1750,14 @@ def __init__( self.meta = None self.metadata = None - if name is not None: - self.set_name(name, kind_attr) if pos is not None: self.set_pos(pos) - def set_name(self, name, kind_attr=None): - """ set the name of this indexer """ - self.name = name - self.kind_attr = kind_attr or "{name}_kind".format(name=name) - if self.cname is None: - self.cname = name - - return self + # These are ensured as long as the passed arguments match the + # constructor annotations. + assert isinstance(self.name, str) + assert isinstance(self.cname, str) + assert isinstance(self.kind_attr, str) def set_axis(self, axis: int): """ set the axis over which I index """ @@ -1771,7 +1774,6 @@ def set_pos(self, pos: int): def set_table(self, table): self.table = table - return self def __repr__(self) -> str: temp = tuple( @@ -1797,10 +1799,13 @@ def __ne__(self, other) -> bool: @property def is_indexed(self) -> bool: """ return whether I am an indexed column """ - try: - return getattr(self.table.cols, self.cname).is_indexed - except AttributeError: + if not hasattr(self.table, "cols"): + # e.g. if self.set_table hasn't been called yet, self.table + # will be None. return False + # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute + # 'error: "None" has no attribute "cols"' + return getattr(self.table.cols, self.cname).is_indexed # type: ignore def copy(self): new_self = copy.copy(self) @@ -2508,6 +2513,7 @@ class DataIndexableCol(DataCol): def validate_names(self): if not Index(self.values).is_object(): + # TODO: should the message here be more specifically non-str? 
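+ # a data-indexable column's labels become queryable column names in the table, so only string (object dtype) labels are allowed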
raise ValueError("cannot have non-object label DataIndexableCol") def get_atom_string(self, block, itemsize): @@ -2842,8 +2848,8 @@ def write_index(self, key, index): else: setattr(self.attrs, "{key}_variety".format(key=key), "regular") converted = _convert_index( - index, self.encoding, self.errors, self.format_type - ).set_name("index") + "index", index, self.encoding, self.errors, self.format_type + ) self.write_array(key, converted.values) @@ -2893,8 +2899,8 @@ def write_multi_index(self, key, index): ) level_key = "{key}_level{idx}".format(key=key, idx=i) conv_level = _convert_index( - lev, self.encoding, self.errors, self.format_type - ).set_name(level_key) + level_key, lev, self.encoding, self.errors, self.format_type + ) self.write_array(level_key, conv_level.values) node = getattr(self.group, level_key) node._v_attrs.kind = conv_level.kind @@ -3436,9 +3442,10 @@ def queryables(self): def index_cols(self): """ return a list of my index cols """ + # Note: each `i.cname` below is assured to be a str. return [(i.axis, i.cname) for i in self.index_axes] - def values_cols(self): + def values_cols(self) -> List[str]: """ return a list of my values cols """ return [i.cname for i in self.values_axes] @@ -3540,6 +3547,8 @@ def indexables(self): self._indexables = [] + # Note: each of the `name` kwargs below are str, ensured + # by the definition in index_cols. # index columns self._indexables.extend( [ @@ -3553,6 +3562,7 @@ def indexables(self): base_pos = len(self._indexables) def f(i, c): + assert isinstance(c, str) klass = DataCol if c in dc: klass = DataIndexableCol @@ -3560,6 +3570,8 @@ def f(i, c): i=i, name=c, pos=base_pos + i, version=self.version ) + # Note: the definition of `values_cols` ensures that each + # `c` below is a str. self._indexables.extend( [f(i, c) for i, c in enumerate(self.attrs.values_cols)] ) @@ -3797,11 +3809,9 @@ def create_axes( if i in axes: name = obj._AXIS_NAMES[i] - index_axes_map[i] = ( - _convert_index(a, self.encoding, self.errors, self.format_type) - .set_name(name) - .set_axis(i) - ) + index_axes_map[i] = _convert_index( + name, a, self.encoding, self.errors, self.format_type + ).set_axis(i) else: # we might be able to change the axes on the appending data if @@ -3900,6 +3910,9 @@ def get_blk_items(mgr, blocks): if data_columns and len(b_items) == 1 and b_items[0] in data_columns: klass = DataIndexableCol name = b_items[0] + if not (name is None or isinstance(name, str)): + # TODO: should the message here be more specifically non-str? 
+ raise ValueError("cannot have non-object label DataIndexableCol") self.data_columns.append(name) # make sure that we match up the existing columns @@ -4582,6 +4595,7 @@ def indexables(self): self._indexables = [GenericIndexCol(name="index", axis=0)] for i, n in enumerate(d._v_names): + assert isinstance(n, str) dc = GenericDataIndexableCol( name=n, pos=i, values=[n], version=self.version @@ -4700,12 +4714,15 @@ def _set_tz(values, tz, preserve_UTC: bool = False, coerce: bool = False): return values -def _convert_index(index, encoding=None, errors="strict", format_type=None): +def _convert_index(name: str, index, encoding=None, errors="strict", format_type=None): + assert isinstance(name, str) + index_name = getattr(index, "name", None) if isinstance(index, DatetimeIndex): converted = index.asi8 return IndexCol( + name, converted, "datetime64", _tables().Int64Col(), @@ -4716,6 +4733,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): elif isinstance(index, TimedeltaIndex): converted = index.asi8 return IndexCol( + name, converted, "timedelta64", _tables().Int64Col(), @@ -4726,6 +4744,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): atom = _tables().Int64Col() # avoid to store ndarray of Period objects return IndexCol( + name, index._ndarray_values, "integer", atom, @@ -4743,6 +4762,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): if inferred_type == "datetime64": converted = values.view("i8") return IndexCol( + name, converted, "datetime64", _tables().Int64Col(), @@ -4753,6 +4773,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): elif inferred_type == "timedelta64": converted = values.view("i8") return IndexCol( + name, converted, "timedelta64", _tables().Int64Col(), @@ -4765,11 +4786,13 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): dtype=np.float64, ) return IndexCol( - converted, "datetime", _tables().Time64Col(), index_name=index_name + name, converted, "datetime", _tables().Time64Col(), index_name=index_name ) elif inferred_type == "date": converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) - return IndexCol(converted, "date", _tables().Time32Col(), index_name=index_name) + return IndexCol( + name, converted, "date", _tables().Time32Col(), index_name=index_name, + ) elif inferred_type == "string": # atom = _tables().ObjectAtom() # return np.asarray(values, dtype='O'), 'object', atom @@ -4777,6 +4800,7 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): converted = _convert_string_array(values, encoding, errors) itemsize = converted.dtype.itemsize return IndexCol( + name, converted, "string", _tables().StringCol(itemsize), @@ -4787,7 +4811,11 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): if format_type == "fixed": atom = _tables().ObjectAtom() return IndexCol( - np.asarray(values, dtype="O"), "object", atom, index_name=index_name + name, + np.asarray(values, dtype="O"), + "object", + atom, + index_name=index_name, ) raise TypeError( "[unicode] is not supported as a in index type for [{0}] formats".format( @@ -4799,17 +4827,25 @@ def _convert_index(index, encoding=None, errors="strict", format_type=None): # take a guess for now, hope the values fit atom = _tables().Int64Col() return IndexCol( - np.asarray(values, dtype=np.int64), "integer", atom, index_name=index_name + name, + np.asarray(values, dtype=np.int64), + "integer", + atom, + 
index_name=index_name, ) elif inferred_type == "floating": atom = _tables().Float64Col() return IndexCol( - np.asarray(values, dtype=np.float64), "float", atom, index_name=index_name + name, + np.asarray(values, dtype=np.float64), + "float", + atom, + index_name=index_name, ) else: # pragma: no cover atom = _tables().ObjectAtom() return IndexCol( - np.asarray(values, dtype="O"), "object", atom, index_name=index_name + name, np.asarray(values, dtype="O"), "object", atom, index_name=index_name, ) From 75815b2aead6dcdc80a771a993366153ca0703ac Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 20 Nov 2019 21:03:51 +0000 Subject: [PATCH 142/185] CI: fix imminent mypy failure (#29747) --- pandas/io/pytables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4c9e10e0f4601..7c447cbf78677 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2044,6 +2044,8 @@ def convert( the underlying table's row count are normalized to that. """ + assert self.table is not None + _start = start if start is not None else 0 _stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows self.values = Int64Index(np.arange(_stop - _start)) From e08047461da1dffd55f5c23a3d6819c416c59c5a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 13:05:18 -0800 Subject: [PATCH 143/185] DEPR: remove reduce kwd from DataFrame.apply (#29730) --- pandas/core/frame.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dbf9f1b2082a3..464067b9988bd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6474,9 +6474,7 @@ def transform(self, func, axis=0, *args, **kwargs): return self.T.transform(func, *args, **kwargs).T return super().transform(func, *args, **kwargs) - def apply( - self, func, axis=0, raw=False, reduce=None, result_type=None, args=(), **kwds - ): + def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ Apply a function along an axis of the DataFrame. From 6d11aa89fc06ebc55519507f6652dcfcba35a302 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 13:07:00 -0800 Subject: [PATCH 144/185] DEPR: remove encoding kwarg from read_stata, DataFrame.to_stata (#29722) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/frame.py | 11 +---------- pandas/io/stata.py | 18 ------------------ pandas/tests/io/test_stata.py | 8 ++------ 4 files changed, 4 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7ac1830af568c..3b87150f544cf 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -334,6 +334,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. 
- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`29608`) - Removed previously deprecated "order" argument from :func:`factorize` (:issue:`19751`) - Removed previously deprecated "v" argument from :meth:`FrozenNDarray.searchsorted`, use "value" instead (:issue:`22672`) +- :func:`read_stata` and :meth:`DataFrame.to_stata` no longer support the "encoding" argument (:issue:`21400`) - Removed previously deprecated "raise_conflict" argument from :meth:`DataFrame.update`, use "errors" instead (:issue:`23585`) - Removed previously deprecated keyword "n" from :meth:`DatetimeIndex.shift`, :meth:`TimedeltaIndex.shift`, :meth:`PeriodIndex.shift`, use "periods" instead (:issue:`22458`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 464067b9988bd..8b31b6d503eda 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,12 +35,7 @@ from pandas._libs import algos as libalgos, lib from pandas.compat.numpy import function as nv -from pandas.util._decorators import ( - Appender, - Substitution, - deprecate_kwarg, - rewrite_axis_style_signature, -) +from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature from pandas.util._validators import ( validate_axis_style_args, validate_bool_kwarg, @@ -1853,13 +1848,11 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def to_stata( self, fname, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, @@ -1889,8 +1882,6 @@ def to_stata( a datetime column has timezone information. write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Unicode is not supported. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder`. time_stamp : datetime diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 24539057a5db9..567eeb7f5cdc8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -58,10 +58,6 @@ convert_categoricals : bool, default True Read value labels and convert columns to Categorical/Factor variables.""" -_encoding_params = """\ -encoding : str, None or encoding - Encoding used to parse the files. None defaults to latin-1.""" - _statafile_processing_params2 = """\ index_col : str, optional Column to set as index. @@ -108,7 +104,6 @@ %s %s %s -%s Returns ------- @@ -132,7 +127,6 @@ ... 
do_something(chunk) """ % ( _statafile_processing_params1, - _encoding_params, _statafile_processing_params2, _chunksize_params, _iterator_params, @@ -189,23 +183,19 @@ %s %s %s -%s """ % ( _statafile_processing_params1, _statafile_processing_params2, - _encoding_params, _chunksize_params, ) @Appender(_read_stata_doc) -@deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) @deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") def read_stata( filepath_or_buffer, convert_dates=True, convert_categoricals=True, - encoding=None, index_col=None, convert_missing=False, preserve_dtypes=True, @@ -1044,7 +1034,6 @@ def __init__(self): class StataReader(StataParser, BaseIterator): __doc__ = _stata_reader_doc - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) @deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") def __init__( self, @@ -1056,7 +1045,6 @@ def __init__( preserve_dtypes=True, columns=None, order_categoricals=True, - encoding=None, chunksize=None, ): super().__init__() @@ -2134,14 +2122,12 @@ class StataWriter(StataParser): _max_string_length = 244 - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def __init__( self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, @@ -2859,8 +2845,6 @@ class StataWriter117(StataWriter): timezone information write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Only latin-1 and ascii are supported. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime @@ -2912,14 +2896,12 @@ class StataWriter117(StataWriter): _max_string_length = 2045 - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def __init__( self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 7fa3b968278d9..2cc80a6e5565d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -383,8 +383,7 @@ def test_encoding(self, version): # GH 4626, proper encoding handling raw = read_stata(self.dta_encoding) - with tm.assert_produces_warning(FutureWarning): - encoded = read_stata(self.dta_encoding, encoding="latin-1") + encoded = read_stata(self.dta_encoding) result = encoded.kreis1849[0] expected = raw.kreis1849[0] @@ -392,10 +391,7 @@ def test_encoding(self, version): assert isinstance(result, str) with tm.ensure_clean() as path: - with tm.assert_produces_warning(FutureWarning): - encoded.to_stata( - path, write_index=False, version=version, encoding="latin-1" - ) + encoded.to_stata(path, write_index=False, version=version) reread_encoded = read_stata(path) tm.assert_frame_equal(encoded, reread_encoded) From a46806c3995e2ddc0948f5c8c34f157c92164e42 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 21 Nov 2019 12:19:53 +0000 Subject: [PATCH 145/185] CLN: use f-strings in core.categorical.py (#29748) --- pandas/core/arrays/categorical.py | 83 +++++++++++++------------------ 1 file changed, 35 insertions(+), 48 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6e2a7b7a6e00..9a94345a769df 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -73,10 +73,10 @@ def _cat_compare_op(op): - opname = "__{op}__".format(op=op.__name__) + opname = f"__{op.__name__}__" @unpack_zerodim_and_defer(opname) - def f(self, other): + 
def func(self, other): # On python2, you can usually compare any type to any type, and # Categoricals can be seen as a custom type, but having different # results depending whether categories are the same or not is kind of @@ -137,11 +137,10 @@ def f(self, other): elif opname == "__ne__": return np.repeat(True, len(self)) else: - msg = ( - "Cannot compare a Categorical for op {op} with a " + raise TypeError( + f"Cannot compare a Categorical for op {opname} with a " "scalar, which is not a category." ) - raise TypeError(msg.format(op=opname)) else: # allow categorical vs object dtype array comparisons for equality @@ -149,16 +148,15 @@ def f(self, other): if opname in ["__eq__", "__ne__"]: return getattr(np.array(self), opname)(np.array(other)) - msg = ( - "Cannot compare a Categorical for op {op} with type {typ}." - "\nIf you want to compare values, use 'np.asarray(cat) " - " other'." + raise TypeError( + f"Cannot compare a Categorical for op {opname} with " + f"type {type(other)}.\nIf you want to compare values, " + "use 'np.asarray(cat) other'." ) - raise TypeError(msg.format(op=opname, typ=type(other))) - f.__name__ = opname + func.__name__ = opname - return f + return func def contains(cat, key, container): @@ -1060,11 +1058,9 @@ def add_categories(self, new_categories, inplace=False): new_categories = [new_categories] already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: - msg = ( - "new categories must not include old categories: " - "{already_included!s}" + raise ValueError( + f"new categories must not include old categories: {already_included}" ) - raise ValueError(msg.format(already_included=already_included)) new_categories = list(self.dtype.categories) + list(new_categories) new_dtype = CategoricalDtype(new_categories, self.ordered) @@ -1120,8 +1116,7 @@ def remove_categories(self, removals, inplace=False): new_categories = [x for x in new_categories if notna(x)] if len(not_included) != 0: - msg = "removals must all be in old categories: {not_included!s}" - raise ValueError(msg.format(not_included=not_included)) + raise ValueError(f"removals must all be in old categories: {not_included}") return self.set_categories( new_categories, ordered=self.ordered, rename=False, inplace=inplace @@ -1299,9 +1294,8 @@ def shift(self, periods, fill_value=None): fill_value = self.categories.get_loc(fill_value) else: raise ValueError( - "'fill_value={}' is not present " - "in this Categorical's " - "categories".format(fill_value) + f"'fill_value={fill_value}' is not present " + "in this Categorical's categories" ) if periods > 0: codes[:periods] = fill_value @@ -1342,8 +1336,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for all other cases, raise for now (similarly as what happens in # Series.__array_prepare__) raise TypeError( - "Object with dtype {dtype} cannot perform " - "the numpy op {op}".format(dtype=self.dtype, op=ufunc.__name__) + f"Object with dtype {self.dtype} cannot perform " + f"the numpy op {ufunc.__name__}" ) def __setstate__(self, state): @@ -1542,9 +1536,9 @@ def check_for_ordered(self, op): """ assert that we are ordered """ if not self.ordered: raise TypeError( - "Categorical is not ordered for operation {op}\n" + f"Categorical is not ordered for operation {op}\n" "you can use .as_ordered() to change the " - "Categorical to an ordered one\n".format(op=op) + "Categorical to an ordered one\n" ) def _values_for_argsort(self): @@ -1679,8 +1673,7 @@ def sort_values(self, inplace=False, ascending=True, 
na_position="last"): """ inplace = validate_bool_kwarg(inplace, "inplace") if na_position not in ["last", "first"]: - msg = "invalid na_position: {na_position!r}" - raise ValueError(msg.format(na_position=na_position)) + raise ValueError(f"invalid na_position: {na_position!r}") sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) @@ -1836,8 +1829,7 @@ def fillna(self, value=None, method=None, limit=None): else: raise TypeError( '"value" parameter must be a scalar, dict ' - "or Series, but you passed a " - '"{0}"'.format(type(value).__name__) + f'or Series, but you passed a {type(value).__name__!r}"' ) return self._constructor(codes, dtype=self.dtype, fastpath=True) @@ -1930,8 +1922,11 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): if fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) else: - msg = "'fill_value' ('{}') is not in this Categorical's categories." - raise TypeError(msg.format(fill_value)) + msg = ( + f"'fill_value' ('{fill_value}') is not in this " + "Categorical's categories." + ) + raise TypeError(msg) codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) result = type(self).from_codes(codes, dtype=dtype) @@ -1969,11 +1964,9 @@ def _tidy_repr(self, max_vals=10, footer=True): head = self[:num]._get_repr(length=False, footer=False) tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) - result = "{head}, ..., {tail}".format(head=head[:-1], tail=tail[1:]) + result = f"{head[:-1]}, ..., {tail[1:]}" if footer: - result = "{result}\n{footer}".format( - result=result, footer=self._repr_footer() - ) + result = f"{result}\n{self._repr_footer()}" return str(result) @@ -2007,9 +2000,7 @@ def _repr_categories_info(self): category_strs = self._repr_categories() dtype = str(self.categories.dtype) - levheader = "Categories ({length}, {dtype}): ".format( - length=len(self.categories), dtype=dtype - ) + levheader = f"Categories ({len(self.categories)}, {dtype}): " width, height = get_terminal_size() max_width = get_option("display.width") or width if console.in_ipython_frontend(): @@ -2033,10 +2024,8 @@ def _repr_categories_info(self): return levheader + "[" + levstring.replace(" < ... < ", " ... 
") + "]" def _repr_footer(self): - - return "Length: {length}\n{info}".format( - length=len(self), info=self._repr_categories_info() - ) + info = self._repr_categories_info() + return f"Length: {len(self)}\n{info}" def _get_repr(self, length=True, na_rep="NaN", footer=True): from pandas.io.formats import format as fmt @@ -2058,7 +2047,7 @@ def __repr__(self) -> str: result = self._get_repr(length=len(self) > _maxlen) else: msg = self._get_repr(length=False, footer=True).replace("\n", ", ") - result = "[], {repr_msg}".format(repr_msg=msg) + result = f"[], {msg}" return result @@ -2189,8 +2178,7 @@ def _reverse_indexer(self): def _reduce(self, name, axis=0, **kwargs): func = getattr(self, name, None) if func is None: - msg = "Categorical cannot perform the operation {op}" - raise TypeError(msg.format(op=name)) + raise TypeError(f"Categorical cannot perform the operation {name}") return func(**kwargs) def min(self, numeric_only=None, **kwargs): @@ -2458,11 +2446,10 @@ def isin(self, values): array([ True, False, True, False, True, False]) """ if not is_list_like(values): + values_type = type(values).__name__ raise TypeError( "only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]".format( - values_type=type(values).__name__ - ) + f" to isin(), you passed a [{values_type}]" ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) From 6e5d14834072e7856987eb31e574b2a05db9f0b9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 21 Nov 2019 04:59:30 -0800 Subject: [PATCH 146/185] REF: Separate window bounds calculation from aggregation functions (#29428) --- pandas/_libs/window.pyx | 965 ++++++++++++-------------------- pandas/_libs/window_indexer.pyx | 165 ++++++ pandas/core/window/common.py | 49 +- pandas/core/window/rolling.py | 280 +++++---- setup.py | 2 + 5 files changed, 741 insertions(+), 720 deletions(-) create mode 100644 pandas/_libs/window_indexer.pyx diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index d6bad0f20d760..303b4f6f24eac 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -96,280 +96,20 @@ def _check_minp(win, minp, N, floor=None) -> int: # Physical description: 366 p. # Series: Prentice-Hall Series in Automatic Computation -# ---------------------------------------------------------------------- -# The indexer objects for rolling -# These define start/end indexers to compute offsets - - -cdef class WindowIndexer: - - cdef: - ndarray start, end - int64_t N, minp, win - bint is_variable - - def get_data(self): - return (self.start, self.end, self.N, - self.win, self.minp, - self.is_variable) - - -cdef class MockFixedWindowIndexer(WindowIndexer): - """ - - We are just checking parameters of the indexer, - and returning a consistent API with fixed/variable - indexers. 
- - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: object - index of the values - floor: optional - unit for flooring - left_closed: bint - left endpoint closedness - right_closed: bint - right endpoint closedness - - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, - object index=None, object floor=None): - - assert index is None - self.is_variable = 0 - self.N = len(values) - self.minp = _check_minp(win, minp, self.N, floor=floor) - self.start = np.empty(0, dtype='int64') - self.end = np.empty(0, dtype='int64') - self.win = win - - -cdef class FixedWindowIndexer(WindowIndexer): - """ - create a fixed length window indexer object - that has start & end, that point to offsets in - the index object; these are defined based on the win - arguments - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: object - index of the values - floor: optional - unit for flooring the unit - left_closed: bint - left endpoint closedness - right_closed: bint - right endpoint closedness - - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, - object index=None, object floor=None): - cdef: - ndarray[int64_t] start_s, start_e, end_s, end_e - - assert index is None - self.is_variable = 0 - self.N = len(values) - self.minp = _check_minp(win, minp, self.N, floor=floor) - - start_s = np.zeros(win, dtype='int64') - start_e = np.arange(win, self.N, dtype='int64') - win + 1 - self.start = np.concatenate([start_s, start_e]) - - end_s = np.arange(win, dtype='int64') + 1 - end_e = start_e + win - self.end = np.concatenate([end_s, end_e]) - self.win = win - - -cdef class VariableWindowIndexer(WindowIndexer): - """ - create a variable length window indexer object - that has start & end, that point to offsets in - the index object; these are defined based on the win - arguments - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: ndarray - index of the values - left_closed: bint - left endpoint closedness - True if the left endpoint is closed, False if open - right_closed: bint - right endpoint closedness - True if the right endpoint is closed, False if open - floor: optional - unit for flooring the unit - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, ndarray index, - object floor=None): - - self.is_variable = 1 - self.N = len(index) - self.minp = _check_minp(win, minp, self.N, floor=floor) - - self.start = np.empty(self.N, dtype='int64') - self.start.fill(-1) - - self.end = np.empty(self.N, dtype='int64') - self.end.fill(-1) - - self.build(index, win, left_closed, right_closed) - - # max window size - self.win = (self.end - self.start).max() - - def build(self, const int64_t[:] index, int64_t win, bint left_closed, - bint right_closed): - - cdef: - ndarray[int64_t] start, end - int64_t start_bound, end_bound, N - Py_ssize_t i, j - - start = self.start - end = self.end - N = self.N - - start[0] = 0 - - # right endpoint is closed - if right_closed: - end[0] = 1 - # right endpoint is open - else: - end[0] = 0 - - with nogil: - - # start is start of slice interval (including) - # end is end of slice 
interval (not including) - for i in range(1, N): - end_bound = index[i] - start_bound = index[i] - win - - # left endpoint is closed - if left_closed: - start_bound -= 1 - - # advance the start bound until we are - # within the constraint - start[i] = i - for j in range(start[i - 1], i): - if index[j] > start_bound: - start[i] = j - break - - # end bound is previous end - # or current index - if index[end[i - 1]] <= end_bound: - end[i] = i + 1 - else: - end[i] = end[i - 1] - - # right endpoint is open - if not right_closed: - end[i] -= 1 - - -def get_window_indexer(values, win, minp, index, closed, - floor=None, use_mock=True): - """ - Return the correct window indexer for the computation. - - Parameters - ---------- - values: 1d ndarray - win: integer, window size - minp: integer, minimum periods - index: 1d ndarray, optional - index to the values array - closed: string, default None - {'right', 'left', 'both', 'neither'} - window endpoint closedness. Defaults to 'right' in - VariableWindowIndexer and to 'both' in FixedWindowIndexer - floor: optional - unit for flooring the unit - use_mock: boolean, default True - if we are a fixed indexer, return a mock indexer - instead of the FixedWindow Indexer. This is a type - compat Indexer that allows us to use a standard - code path with all of the indexers. - - Returns - ------- - tuple of 1d int64 ndarrays of the offsets & data about the window - - """ - - cdef: - bint left_closed = False - bint right_closed = False - - assert closed is None or closed in ['right', 'left', 'both', 'neither'] - - # if windows is variable, default is 'right', otherwise default is 'both' - if closed is None: - closed = 'right' if index is not None else 'both' - - if closed in ['right', 'both']: - right_closed = True - - if closed in ['left', 'both']: - left_closed = True - - if index is not None: - indexer = VariableWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - elif use_mock: - indexer = MockFixedWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - else: - indexer = FixedWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - return indexer.get_data() - # ---------------------------------------------------------------------- # Rolling count # this is only an impl for index not None, IOW, freq aware -def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_count(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp): cdef: float64_t val, count_x = 0.0 - int64_t s, e, nobs, N + int64_t s, e, nobs, N = len(values) Py_ssize_t i, j - int64_t[:] start, end ndarray[float64_t] output - start, end, N, win, minp, _ = get_window_indexer(values, win, - minp, index, closed) output = np.empty(N, dtype=float) with nogil: @@ -442,80 +182,75 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogi sum_x[0] = sum_x[0] - val -def roll_sum(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev_x, sum_x = 0 - int64_t s, e, range_endpoint - int64_t nobs = 0, i, j, N - bint is_variable - int64_t[:] start, end + float64_t sum_x = 0 + int64_t s, e + int64_t nobs = 0, i, j, N = len(values) ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed, - floor=0) output = 
np.empty(N, dtype=float) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we have 2 paths - # but is faster + with nogil: - if is_variable: + for i in range(0, N): + s = start[i] + e = end[i] - # variable window - with nogil: + if i == 0: - for i in range(0, N): - s = start[i] - e = end[i] + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + add_sum(values[j], &nobs, &sum_x) - if i == 0: + else: - # setup - sum_x = 0.0 - nobs = 0 - for j in range(s, e): - add_sum(values[j], &nobs, &sum_x) + # calculate deletes + for j in range(start[i - 1], s): + remove_sum(values[j], &nobs, &sum_x) - else: + # calculate adds + for j in range(end[i - 1], e): + add_sum(values[j], &nobs, &sum_x) - # calculate deletes - for j in range(start[i - 1], s): - remove_sum(values[j], &nobs, &sum_x) + output[i] = calc_sum(minp, nobs, sum_x) - # calculate adds - for j in range(end[i - 1], e): - add_sum(values[j], &nobs, &sum_x) + return output - output[i] = calc_sum(minp, nobs, sum_x) - else: +def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): + cdef: + float64_t val, prev_x, sum_x = 0 + int64_t range_endpoint + int64_t nobs = 0, i, N = len(values) + ndarray[float64_t] output - # fixed window + output = np.empty(N, dtype=float) - range_endpoint = int_max(minp, 1) - 1 + range_endpoint = int_max(minp, 1) - 1 - with nogil: + with nogil: - for i in range(0, range_endpoint): - add_sum(values[i], &nobs, &sum_x) - output[i] = NaN + for i in range(0, range_endpoint): + add_sum(values[i], &nobs, &sum_x) + output[i] = NaN - for i in range(range_endpoint, N): - val = values[i] - add_sum(val, &nobs, &sum_x) + for i in range(range_endpoint, N): + val = values[i] + add_sum(val, &nobs, &sum_x) - if i > win - 1: - prev_x = values[i - win] - remove_sum(prev_x, &nobs, &sum_x) + if i > win - 1: + prev_x = values[i - win] + remove_sum(prev_x, &nobs, &sum_x) - output[i] = calc_sum(minp, nobs, sum_x) + output[i] = calc_sum(minp, nobs, sum_x) return output - # ---------------------------------------------------------------------- # Rolling mean @@ -563,77 +298,75 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, neg_ct[0] = neg_ct[0] - 1 -def roll_mean(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, result, sum_x = 0 - int64_t s, e - bint is_variable - Py_ssize_t nobs = 0, i, j, neg_ct = 0, N - int64_t[:] start, end + float64_t val, prev_x, sum_x = 0 + Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we have 2 paths - # but is faster + with nogil: + for i in range(minp - 1): + val = values[i] + add_mean(val, &nobs, &sum_x, &neg_ct) + output[i] = NaN + + for i in range(minp - 1, N): + val = values[i] + add_mean(val, &nobs, &sum_x, &neg_ct) - if is_variable: + if i > win - 1: + prev_x = values[i - win] + remove_mean(prev_x, &nobs, &sum_x, &neg_ct) - with nogil: + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) - for i in range(0, N): - s = start[i] - e = end[i] + return output - if i == 0: - # setup - sum_x = 0.0 - nobs = 0 - 
for j in range(s, e): - val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) +def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, sum_x = 0 + int64_t s, e + Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) + ndarray[float64_t] output - else: + output = np.empty(N, dtype=float) - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + with nogil: - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + for i in range(0, N): + s = start[i] + e = end[i] - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + if i == 0: - else: + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + val = values[j] + add_mean(val, &nobs, &sum_x, &neg_ct) - with nogil: - for i in range(minp - 1): - val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) - output[i] = NaN + else: - for i in range(minp - 1, N): - val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) - if i > win - 1: - prev_x = values[i - win] - remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + add_mean(val, &nobs, &sum_x, &neg_ct) - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) return output - # ---------------------------------------------------------------------- # Rolling variance @@ -696,8 +429,8 @@ cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x, ssqdm_x[0] = 0 -def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed, int ddof=1): +def roll_var_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win, int ddof=1): """ Numerically stable implementation using Welford's method. 
""" @@ -705,98 +438,102 @@ def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp, float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, float64_t val, prev, delta, mean_x_old int64_t s, e - bint is_variable - Py_ssize_t i, j, N - int64_t[:] start, end + Py_ssize_t i, j, N = len(values) ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) # Check for windows larger than array, addresses #7297 win = min(win, N) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we - # have 2 paths but is faster + with nogil: - if is_variable: + # Over the first window, observations can only be added, never + # removed + for i in range(win): + add_var(values[i], &nobs, &mean_x, &ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) - with nogil: + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - for i in range(0, N): + # After the first window, observations can both be added and + # removed + for i in range(win, N): + val = values[i] + prev = values[i - win] - s = start[i] - e = end[i] + if notnan(val): + if prev == prev: - # Over the first window, observations can only be added - # never removed - if i == 0: + # Adding one observation and removing another one + delta = val - prev + mean_x_old = mean_x - for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) + mean_x += delta / nobs + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: + add_var(val, &nobs, &mean_x, &ssqdm_x) + elif prev == prev: + remove_var(prev, &nobs, &mean_x, &ssqdm_x) - # After the first window, observations can both be added - # and removed + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + return output - # calculate adds - for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) - # calculate deletes - for j in range(start[i - 1], s): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x) +def roll_var_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int ddof=1): + """ + Numerically stable implementation using Welford's method. 
+ """ + cdef: + float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, + float64_t val, prev, delta, mean_x_old + int64_t s, e + Py_ssize_t i, j, N = len(values) + ndarray[float64_t] output - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + output = np.empty(N, dtype=float) - else: + with nogil: - with nogil: + for i in range(0, N): - # Over the first window, observations can only be added, never - # removed - for i in range(win): - add_var(values[i], &nobs, &mean_x, &ssqdm_x) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + s = start[i] + e = end[i] - # a part of Welford's method for the online variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # Over the first window, observations can only be added + # never removed + if i == 0: - # After the first window, observations can both be added and - # removed - for i in range(win, N): - val = values[i] - prev = values[i - win] + for j in range(s, e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x) - if notnan(val): - if prev == prev: + else: - # Adding one observation and removing another one - delta = val - prev - mean_x_old = mean_x + # After the first window, observations can both be added + # and removed - mean_x += delta / nobs - ssqdm_x += ((nobs - 1) * val - + (nobs + 1) * prev - - 2 * nobs * mean_x_old) * delta / nobs + # calculate adds + for j in range(end[i - 1], e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x) - else: - add_var(val, &nobs, &mean_x, &ssqdm_x) - elif prev == prev: - remove_var(prev, &nobs, &mean_x, &ssqdm_x) + # calculate deletes + for j in range(start[i - 1], s): + remove_var(values[j], &nobs, &mean_x, &ssqdm_x) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) return output - # ---------------------------------------------------------------------- # Rolling skewness + cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, float64_t x, float64_t xx, float64_t xxx) nogil: @@ -861,76 +598,80 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs, xxx[0] = xxx[0] - val * val * val -def roll_skew(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_skew_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, prev float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N + int64_t nobs = 0, i, j, N = len(values) int64_t s, e - bint is_variable - int64_t[:] start, end ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - if is_variable: + with nogil: + for i in range(minp - 1): + val = values[i] + add_skew(val, &nobs, &x, &xx, &xxx) + output[i] = NaN - with nogil: + for i in range(minp - 1, N): + val = values[i] + add_skew(val, &nobs, &x, &xx, &xxx) - for i in range(0, N): + if i > win - 1: + prev = values[i - win] + remove_skew(prev, &nobs, &x, &xx, &xxx) - s = start[i] - e = end[i] + output[i] = calc_skew(minp, nobs, x, xx, xxx) - # Over the first window, observations can only be added - # never removed - if i == 0: + return output - for j in range(s, e): - val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) - else: +def roll_skew_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0 + int64_t nobs = 0, i, j, N = len(values) + int64_t s, e + ndarray[float64_t] output + + output = np.empty(N, 
dtype=float) - # After the first window, observations can both be added - # and removed + with nogil: - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) + for i in range(0, N): - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_skew(val, &nobs, &x, &xx, &xxx) + s = start[i] + e = end[i] - output[i] = calc_skew(minp, nobs, x, xx, xxx) + # Over the first window, observations can only be added + # never removed + if i == 0: - else: + for j in range(s, e): + val = values[j] + add_skew(val, &nobs, &x, &xx, &xxx) - with nogil: - for i in range(minp - 1): - val = values[i] - add_skew(val, &nobs, &x, &xx, &xxx) - output[i] = NaN + else: - for i in range(minp - 1, N): - val = values[i] - add_skew(val, &nobs, &x, &xx, &xxx) + # After the first window, observations can both be added + # and removed - if i > win - 1: - prev = values[i - win] - remove_skew(prev, &nobs, &x, &xx, &xxx) + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + add_skew(val, &nobs, &x, &xx, &xxx) - output[i] = calc_skew(minp, nobs, x, xx, xxx) + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_skew(val, &nobs, &x, &xx, &xxx) - return output + output[i] = calc_skew(minp, nobs, x, xx, xxx) + return output # ---------------------------------------------------------------------- # Rolling kurtosis @@ -1005,69 +746,73 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, xxxx[0] = xxxx[0] - val * val * val * val -def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_kurt_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, prev float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, N + int64_t nobs = 0, i, j, N = len(values) int64_t s, e - bint is_variable - int64_t[:] start, end ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - if is_variable: + with nogil: - with nogil: + for i in range(minp - 1): + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) + output[i] = NaN - for i in range(0, N): + for i in range(minp - 1, N): + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) - s = start[i] - e = end[i] + if i > win - 1: + prev = values[i - win] + remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) - # Over the first window, observations can only be added - # never removed - if i == 0: + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) - for j in range(s, e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + return output - else: - # After the first window, observations can both be added - # and removed +def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 + int64_t nobs = 0, i, j, s, e, N = len(values) + ndarray[float64_t] output - # calculate adds - for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + output = np.empty(N, dtype=float) - # calculate deletes - for j in range(start[i - 1], s): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + with nogil: - output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + for i in range(0, N): - else: + s = start[i] + e = end[i] - with nogil: + # Over the first window, observations can only be added + # never removed + if i == 0: - 
for i in range(minp - 1): - add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) - output[i] = NaN + for j in range(s, e): + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) - for i in range(minp - 1, N): - add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) + else: - if i > win - 1: - prev = values[i - win] - remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) + # After the first window, observations can both be added + # and removed - output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + # calculate adds + for j in range(end[i - 1], e): + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) return output @@ -1076,31 +821,26 @@ def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, # Rolling median, min, max -def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, res, prev - bint err = 0, is_variable + bint err = 0 int ret = 0 skiplist_t *sl Py_ssize_t i, j - int64_t nobs = 0, N, s, e + int64_t nobs = 0, N = len(values), s, e int midpoint - int64_t[:] start, end ndarray[float64_t] output # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs - start, end, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed, - use_mock=False) output = np.empty(N, dtype=float) - if win == 0: + if win == 0 or (end - start).max() == 0: output[:] = NaN return output - + win = (end - start).max() sl = skiplist_init(win) if sl == NULL: raise MemoryError("skiplist_init failed") @@ -1209,76 +949,89 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, return result -def roll_max(ndarray[numeric] values, int64_t win, int64_t minp, - object index, object closed): +def roll_max_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - values: numpy array - window: int, size of rolling window - minp: if number of observations in window + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window is below this, output a NaN - index: ndarray, optional + index : ndarray, optional index for window computation - closed: 'right', 'left', 'both', 'neither' + closed : 'right', 'left', 'both', 'neither' make the interval closed on the right, left, both or neither endpoints """ - return _roll_min_max(values, win, minp, index, closed=closed, is_max=1) + return _roll_min_max_fixed(values, start, end, minp, win, is_max=1) -def roll_min(ndarray[numeric] values, int64_t win, int64_t minp, - object index, object closed): +def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
Parameters ---------- - values: numpy array - window: int, size of rolling window - minp: if number of observations in window + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window is below this, output a NaN - index: ndarray, optional + index : ndarray, optional index for window computation + closed : 'right', 'left', 'both', 'neither' + make the interval closed on the right, left, + both or neither endpoints """ - return _roll_min_max(values, win, minp, index, is_max=0, closed=closed) + return _roll_min_max_variable(values, start, end, minp, is_max=1) -cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, - object index, object closed, bint is_max): +def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): """ - Moving min/max of 1d array of any numeric type along axis=0 - ignoring NaNs. + Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. + + Parameters + ---------- + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window + is below this, output a NaN + index : ndarray, optional + index for window computation """ - cdef: - ndarray[int64_t] starti, endi - int64_t N - bint is_variable + return _roll_min_max_fixed(values, start, end, minp, win, is_max=0) - starti, endi, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed) - if is_variable: - return _roll_min_max_variable(values, starti, endi, N, win, minp, - is_max) - else: - return _roll_min_max_fixed(values, N, win, minp, is_max) +def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + """ + Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. 
+ + Parameters + ---------- + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window + is below this, output a NaN + index : ndarray, optional + index for window computation + """ + return _roll_min_max_variable(values, start, end, minp, is_max=0) cdef _roll_min_max_variable(ndarray[numeric] values, ndarray[int64_t] starti, ndarray[int64_t] endi, - int64_t N, - int64_t win, int64_t minp, bint is_max): cdef: numeric ai int64_t i, close_offset, curr_win_size - Py_ssize_t nobs = 0 + Py_ssize_t nobs = 0, N = len(values) deque Q[int64_t] # min/max always the front deque W[int64_t] # track the whole window for nobs compute ndarray[float64_t, ndim=1] output @@ -1353,15 +1106,16 @@ cdef _roll_min_max_variable(ndarray[numeric] values, cdef _roll_min_max_fixed(ndarray[numeric] values, - int64_t N, - int64_t win, + ndarray[int64_t] starti, + ndarray[int64_t] endi, int64_t minp, + int64_t win, bint is_max): cdef: numeric ai bint should_replace int64_t i, removed, window_i, - Py_ssize_t nobs = 0 + Py_ssize_t nobs = 0, N = len(values) int64_t* death numeric* ring numeric* minvalue @@ -1457,8 +1211,8 @@ interpolation_types = { } -def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, - int64_t minp, object index, object closed, +def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win, float64_t quantile, str interpolation): """ O(N log(window)) implementation using skip list @@ -1466,10 +1220,8 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, cdef: float64_t val, prev, midpoint, idx_with_fraction skiplist_t *skiplist - int64_t nobs = 0, i, j, s, e, N + int64_t nobs = 0, i, j, s, e, N = len(values) Py_ssize_t idx - bint is_variable - int64_t[:] start, end ndarray[float64_t] output float64_t vlow, vhigh InterpolationType interpolation_type @@ -1485,16 +1237,12 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs - start, end, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed, - use_mock=False) output = np.empty(N, dtype=float) - if win == 0: + if win == 0 or (end - start).max() == 0: output[:] = NaN return output - + win = (end - start).max() skiplist = skiplist_init(win) if skiplist == NULL: raise MemoryError("skiplist_init failed") @@ -1575,18 +1323,17 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, return output -def roll_generic(object obj, - int64_t win, int64_t minp, object index, object closed, - int offset, object func, bint raw, - object args, object kwargs): +def roll_generic_fixed(object obj, + ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp, int64_t win, + int offset, object func, bint raw, + object args, object kwargs): cdef: ndarray[float64_t] output, counts, bufarr ndarray[float64_t, cast=True] arr float64_t *buf float64_t *oldbuf - int64_t nobs = 0, i, j, s, e, N - bint is_variable - int64_t[:] start, end + int64_t nobs = 0, i, j, s, e, N = len(start) n = len(obj) if n == 0: @@ -1599,36 +1346,13 @@ def roll_generic(object obj, if not arr.flags.c_contiguous: arr = arr.copy('C') - counts = roll_sum(np.concatenate([np.isfinite(arr).astype(float), - np.array([0.] 
* offset)]), - win, minp, index, closed)[offset:] - - start, end, N, win, minp, is_variable = get_window_indexer(arr, win, - minp, index, - closed, - floor=0) + counts = roll_sum_fixed(np.concatenate([np.isfinite(arr).astype(float), + np.array([0.] * offset)]), + start, end, minp, win)[offset:] output = np.empty(N, dtype=float) - if is_variable: - # variable window arr or series - - if offset != 0: - raise ValueError("unable to roll_generic with a non-zero offset") - - for i in range(0, N): - s = start[i] - e = end[i] - - if counts[i] >= minp: - if raw: - output[i] = func(arr[s:e], *args, **kwargs) - else: - output[i] = func(obj.iloc[s:e], *args, **kwargs) - else: - output[i] = NaN - - elif not raw: + if not raw: # series for i in range(N): if counts[i] >= minp: @@ -1672,6 +1396,53 @@ def roll_generic(object obj, return output +def roll_generic_variable(object obj, + ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp, + int offset, object func, bint raw, + object args, object kwargs): + cdef: + ndarray[float64_t] output, counts, bufarr + ndarray[float64_t, cast=True] arr + float64_t *buf + float64_t *oldbuf + int64_t nobs = 0, i, j, s, e, N = len(start) + + n = len(obj) + if n == 0: + return obj + + arr = np.asarray(obj) + + # ndarray input + if raw: + if not arr.flags.c_contiguous: + arr = arr.copy('C') + + counts = roll_sum_variable(np.concatenate([np.isfinite(arr).astype(float), + np.array([0.] * offset)]), + start, end, minp)[offset:] + + output = np.empty(N, dtype=float) + + if offset != 0: + raise ValueError("unable to roll_generic with a non-zero offset") + + for i in range(0, N): + s = start[i] + e = end[i] + + if counts[i] >= minp: + if raw: + output[i] = func(arr[s:e], *args, **kwargs) + else: + output[i] = func(obj.iloc[s:e], *args, **kwargs) + else: + output[i] = NaN + + return output + + # ---------------------------------------------------------------------- # Rolling sum and mean for weighted window diff --git a/pandas/_libs/window_indexer.pyx b/pandas/_libs/window_indexer.pyx new file mode 100644 index 0000000000000..8f49a8b9462d3 --- /dev/null +++ b/pandas/_libs/window_indexer.pyx @@ -0,0 +1,165 @@ +# cython: boundscheck=False, wraparound=False, cdivision=True + +import numpy as np +from numpy cimport ndarray, int64_t + +# ---------------------------------------------------------------------- +# The indexer objects for rolling +# These define start/end indexers to compute offsets + + +class MockFixedWindowIndexer: + """ + + We are just checking parameters of the indexer, + and returning a consistent API with fixed/variable + indexers. 
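+
+    A minimal NumPy sketch (illustration only, not part of the original
+    file) of the bounds the non-mock ``FixedWindowIndexer`` below computes,
+    assuming ``win=3`` over ``N=5`` values:
+
+        import numpy as np
+
+        win, N = 3, 5
+        start = np.concatenate([np.zeros(win, dtype='int64'),
+                                np.arange(win, N, dtype='int64') - win + 1])[:N]
+        end = np.concatenate([np.arange(win, dtype='int64') + 1,
+                              np.arange(win, N, dtype='int64') + 1])[:N]
+        # start -> [0, 0, 0, 1, 2], end -> [1, 2, 3, 4, 5]; row i aggregates
+        # values[start[i]:end[i]], i.e. a trailing window of size <= win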
+ + Parameters + ---------- + values: ndarray + values data array + win: int64_t + window size + index: object + index of the values + closed: string + closed behavior + """ + def __init__(self, ndarray values, int64_t win, object closed, object index=None): + + self.start = np.empty(0, dtype='int64') + self.end = np.empty(0, dtype='int64') + + def get_window_bounds(self): + return self.start, self.end + + +class FixedWindowIndexer: + """ + create a fixed length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments + + Parameters + ---------- + values: ndarray + values data array + win: int64_t + window size + index: object + index of the values + closed: string + closed behavior + """ + def __init__(self, ndarray values, int64_t win, object closed, object index=None): + cdef: + ndarray[int64_t, ndim=1] start_s, start_e, end_s, end_e + int64_t N = len(values) + + start_s = np.zeros(win, dtype='int64') + start_e = np.arange(win, N, dtype='int64') - win + 1 + self.start = np.concatenate([start_s, start_e])[:N] + + end_s = np.arange(win, dtype='int64') + 1 + end_e = start_e + win + self.end = np.concatenate([end_s, end_e])[:N] + + def get_window_bounds(self): + return self.start, self.end + + +class VariableWindowIndexer: + """ + create a variable length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments + + Parameters + ---------- + values: ndarray + values data array + win: int64_t + window size + index: ndarray + index of the values + closed: string + closed behavior + """ + def __init__(self, ndarray values, int64_t win, object closed, ndarray index): + cdef: + bint left_closed = False + bint right_closed = False + int64_t N = len(index) + + # if windows is variable, default is 'right', otherwise default is 'both' + if closed is None: + closed = 'right' if index is not None else 'both' + + if closed in ['right', 'both']: + right_closed = True + + if closed in ['left', 'both']: + left_closed = True + + self.start, self.end = self.build(index, win, left_closed, right_closed, N) + + @staticmethod + def build(const int64_t[:] index, int64_t win, bint left_closed, + bint right_closed, int64_t N): + + cdef: + ndarray[int64_t] start, end + int64_t start_bound, end_bound + Py_ssize_t i, j + + start = np.empty(N, dtype='int64') + start.fill(-1) + end = np.empty(N, dtype='int64') + end.fill(-1) + + start[0] = 0 + + # right endpoint is closed + if right_closed: + end[0] = 1 + # right endpoint is open + else: + end[0] = 0 + + with nogil: + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, N): + end_bound = index[i] + start_bound = index[i] - win + + # left endpoint is closed + if left_closed: + start_bound -= 1 + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if index[j] > start_bound: + start[i] = j + break + + # end bound is previous end + # or current index + if index[end[i - 1]] <= end_bound: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + # right endpoint is open + if not right_closed: + end[i] -= 1 + return start, end + + def get_window_bounds(self): + return self.start, self.end diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 3fd567f97edae..453fd12495543 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -1,5 +1,6 @@ """Common 
utility functions for rolling operations""" from collections import defaultdict +from typing import Callable, Optional import warnings import numpy as np @@ -62,12 +63,20 @@ def __init__(self, obj, *args, **kwargs): cov = _dispatch("cov", other=None, pairwise=None) def _apply( - self, func, name=None, window=None, center=None, check_minp=None, **kwargs + self, + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, + name: Optional[str] = None, + **kwargs, ): """ Dispatch to apply; we are stripping all of the _apply kwargs and performing the original function call on the grouped object. """ + kwargs.pop("floor", None) # TODO: can we de-duplicate with _dispatch? def f(x, name=name, *args): @@ -267,6 +276,44 @@ def _use_window(minp, window): return minp +def calculate_min_periods( + window: int, + min_periods: Optional[int], + num_values: int, + required_min_periods: int, + floor: int, +) -> int: + """ + Calculates final minimum periods value for rolling aggregations. + + Parameters + ---------- + window : passed window value + min_periods : passed min periods value + num_values : total number of values + required_min_periods : required min periods per aggregation function + floor : required min periods per aggregation function + + Returns + ------- + min_periods : int + """ + if min_periods is None: + min_periods = window + else: + min_periods = max(required_min_periods, min_periods) + if min_periods > window: + raise ValueError( + "min_periods {min_periods} must be <= " + "window {window}".format(min_periods=min_periods, window=window) + ) + elif min_periods > num_values: + min_periods = num_values + 1 + elif min_periods < 0: + raise ValueError("min_periods must be >= 0") + return max(min_periods, floor) + + def _zsqrt(x): with np.errstate(all="ignore"): result = np.sqrt(x) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index bec350f6b7d8b..fd2e8aa2ad02f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -3,6 +3,7 @@ similar to how we have a Groupby object. """ from datetime import timedelta +from functools import partial from textwrap import dedent from typing import Callable, Dict, List, Optional, Set, Tuple, Union import warnings @@ -10,6 +11,7 @@ import numpy as np import pandas._libs.window as libwindow +import pandas._libs.window_indexer as libwindow_indexer from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -43,10 +45,10 @@ _doc_template, _flex_binary_moment, _offset, - _require_min_periods, _shared_docs, _use_window, _zsqrt, + calculate_min_periods, ) @@ -366,39 +368,55 @@ def _center_window(self, result, window) -> np.ndarray: result = np.copy(result[tuple(lead_indexer)]) return result - def _get_roll_func( - self, cfunc: Callable, check_minp: Callable, index: np.ndarray, **kwargs - ) -> Callable: + def _get_roll_func(self, func_name: str) -> Callable: """ Wrap rolling function to check values passed. 
Parameters ---------- - cfunc : callable + func_name : str Cython function used to calculate rolling statistics - check_minp : callable - function to check minimum period parameter - index : ndarray - used for variable window Returns ------- func : callable """ + window_func = getattr(libwindow, func_name, None) + if window_func is None: + raise ValueError( + "we do not support this function " + "in libwindow.{func_name}".format(func_name=func_name) + ) + return window_func - def func(arg, window, min_periods=None, closed=None): - minp = check_minp(min_periods, window) - return cfunc(arg, window, minp, index, closed, **kwargs) + def _get_cython_func_type(self, func): + """ + Return a variable or fixed cython function type. - return func + Variable algorithms do not use window while fixed do. + """ + if self.is_freq_type: + return self._get_roll_func("{}_variable".format(func)) + return partial( + self._get_roll_func("{}_fixed".format(func)), win=self._get_window() + ) + + def _get_window_indexer(self): + """ + Return an indexer class that will compute the window start and end bounds + """ + if self.is_freq_type: + return libwindow_indexer.VariableWindowIndexer + return libwindow_indexer.FixedWindowIndexer def _apply( self, - func: Union[str, Callable], + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, name: Optional[str] = None, - window: Optional[Union[int, str]] = None, - center: Optional[bool] = None, - check_minp: Optional[Callable] = None, **kwargs, ): """ @@ -408,13 +426,13 @@ def _apply( Parameters ---------- - func : str/callable to apply - name : str, optional - name of this function - window : int/str, default to _get_window() - window length or offset - center : bool, default to self.center - check_minp : function, default to _use_window + func : callable function to apply + center : bool + require_min_periods : int + floor: int + is_weighted + name: str, + compatibility with groupby.rolling **kwargs additional arguments for rolling function and window function @@ -422,20 +440,13 @@ def _apply( ------- y : type of input """ - - if center is None: - center = self.center - - if check_minp is None: - check_minp = _use_window - - if window is None: - win_type = self._get_win_type(kwargs) - window = self._get_window(win_type=win_type) + win_type = self._get_win_type(kwargs) + window = self._get_window(win_type=win_type) blocks, obj = self._create_blocks() block_list = list(blocks) index_as_array = self._get_index() + window_indexer = self._get_window_indexer() results = [] exclude = [] # type: List[Scalar] @@ -455,36 +466,27 @@ def _apply( results.append(values.copy()) continue - # if we have a string function name, wrap it - if isinstance(func, str): - cfunc = getattr(libwindow, func, None) - if cfunc is None: - raise ValueError( - "we do not support this function " - "in libwindow.{func}".format(func=func) - ) - - func = self._get_roll_func(cfunc, check_minp, index_as_array, **kwargs) - # calculation function - if center: - offset = _offset(window, center) - additional_nans = np.array([np.NaN] * offset) + offset = _offset(window, center) if center else 0 + additional_nans = np.array([np.nan] * offset) + + if not is_weighted: def calc(x): - return func( - np.concatenate((x, additional_nans)), - window, - min_periods=self.min_periods, - closed=self.closed, + x = np.concatenate((x, additional_nans)) + min_periods = calculate_min_periods( + window, self.min_periods, len(x), require_min_periods, floor ) + start, end = 
window_indexer( + x, window, self.closed, index_as_array + ).get_window_bounds() + return func(x, start, end, min_periods) else: def calc(x): - return func( - x, window, min_periods=self.min_periods, closed=self.closed - ) + x = np.concatenate((x, additional_nans)) + return func(x, window, self.min_periods) with np.errstate(all="ignore"): if values.ndim > 1: @@ -995,8 +997,8 @@ def _get_window( # GH #15662. `False` makes symmetric window, rather than periodic. return sig.get_window(win_type, window, False).astype(float) - def _get_roll_func( - self, cfunc: Callable, check_minp: Callable, index: np.ndarray, **kwargs + def _get_weighted_roll_func( + self, cfunc: Callable, check_minp: Callable, **kwargs ) -> Callable: def func(arg, window, min_periods=None, closed=None): minp = check_minp(min_periods, len(window)) @@ -1070,25 +1072,38 @@ def aggregate(self, func, *args, **kwargs): @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) - return self._apply("roll_weighted_sum", **kwargs) + window_func = self._get_roll_func("roll_weighted_sum") + window_func = self._get_weighted_roll_func(window_func, _use_window) + return self._apply( + window_func, center=self.center, is_weighted=True, name="sum", **kwargs + ) @Substitution(name="window") @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - return self._apply("roll_weighted_mean", **kwargs) + window_func = self._get_roll_func("roll_weighted_mean") + window_func = self._get_weighted_roll_func(window_func, _use_window) + return self._apply( + window_func, center=self.center, is_weighted=True, name="mean", **kwargs + ) @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) - return self._apply("roll_weighted_var", ddof=ddof, **kwargs) + window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) + window_func = self._get_weighted_roll_func(window_func, _use_window) + kwargs.pop("name", None) + return self._apply( + window_func, center=self.center, is_weighted=True, name="var", **kwargs + ) @Substitution(name="window", versionadded="\n.. 
versionadded:: 1.0.0\n") @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(ddof=ddof, **kwargs)) + return _zsqrt(self.var(ddof=ddof, name="std", **kwargs)) class _Rolling(_Window): @@ -1203,9 +1218,9 @@ def apply(self, func, raw=None, args=(), kwargs={}): from pandas import Series kwargs.pop("_level", None) + kwargs.pop("floor", None) window = self._get_window() offset = _offset(window, self.center) - index_as_array = self._get_index() # TODO: default is for backward compat # change to False in the future @@ -1221,28 +1236,31 @@ def apply(self, func, raw=None, args=(), kwargs={}): ) raw = True - def f(arg, window, min_periods, closed): - minp = _use_window(min_periods, window) + window_func = partial( + self._get_cython_func_type("roll_generic"), + args=args, + kwargs=kwargs, + raw=raw, + offset=offset, + func=func, + ) + + def apply_func(values, begin, end, min_periods, raw=raw): if not raw: - arg = Series(arg, index=self.obj.index) - return libwindow.roll_generic( - arg, - window, - minp, - index_as_array, - closed, - offset, - func, - raw, - args, - kwargs, - ) + values = Series(values, index=self.obj.index) + return window_func(values, begin, end, min_periods) - return self._apply(f, func, args=args, kwargs=kwargs, center=False, raw=raw) + # TODO: Why do we always pass center=False? + # name=func for WindowGroupByMixin._apply + return self._apply(apply_func, center=False, floor=0, name=func) def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) - return self._apply("roll_sum", "sum", **kwargs) + window_func = self._get_cython_func_type("roll_sum") + kwargs.pop("floor", None) + return self._apply( + window_func, center=self.center, floor=0, name="sum", **kwargs + ) _shared_docs["max"] = dedent( """ @@ -1257,7 +1275,8 @@ def sum(self, *args, **kwargs): def max(self, *args, **kwargs): nv.validate_window_func("max", args, kwargs) - return self._apply("roll_max", "max", **kwargs) + window_func = self._get_cython_func_type("roll_max") + return self._apply(window_func, center=self.center, name="max", **kwargs) _shared_docs["min"] = dedent( """ @@ -1298,11 +1317,13 @@ def max(self, *args, **kwargs): def min(self, *args, **kwargs): nv.validate_window_func("min", args, kwargs) - return self._apply("roll_min", "min", **kwargs) + window_func = self._get_cython_func_type("roll_min") + return self._apply(window_func, center=self.center, name="min", **kwargs) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - return self._apply("roll_mean", "mean", **kwargs) + window_func = self._get_cython_func_type("roll_mean") + return self._apply(window_func, center=self.center, name="mean", **kwargs) _shared_docs["median"] = dedent( """ @@ -1342,27 +1363,40 @@ def mean(self, *args, **kwargs): ) def median(self, **kwargs): - return self._apply("roll_median_c", "median", **kwargs) + window_func = self._get_roll_func("roll_median_c") + window_func = partial(window_func, win=self._get_window()) + return self._apply(window_func, center=self.center, name="median", **kwargs) def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - window = self._get_window() - index_as_array = self._get_index() + kwargs.pop("require_min_periods", None) + window_func = self._get_cython_func_type("roll_var") - def f(arg, *args, **kwargs): - minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt( - libwindow.roll_var(arg, window, minp, 
index_as_array, self.closed, ddof) - ) + def zsqrt_func(values, begin, end, min_periods): + return _zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) + # ddof passed again for compat with groupby.rolling return self._apply( - f, "std", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + zsqrt_func, + center=self.center, + require_min_periods=1, + name="std", + ddof=ddof, + **kwargs, ) def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) + kwargs.pop("require_min_periods", None) + window_func = partial(self._get_cython_func_type("roll_var"), ddof=ddof) + # ddof passed again for compat with groupby.rolling return self._apply( - "roll_var", "var", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + window_func, + center=self.center, + require_min_periods=1, + name="var", + ddof=ddof, + **kwargs, ) _shared_docs[ @@ -1377,8 +1411,14 @@ def var(self, ddof=1, *args, **kwargs): """ def skew(self, **kwargs): + window_func = self._get_cython_func_type("roll_skew") + kwargs.pop("require_min_periods", None) return self._apply( - "roll_skew", "skew", check_minp=_require_min_periods(3), **kwargs + window_func, + center=self.center, + require_min_periods=3, + name="skew", + **kwargs, ) _shared_docs["kurt"] = dedent( @@ -1414,8 +1454,14 @@ def skew(self, **kwargs): ) def kurt(self, **kwargs): + window_func = self._get_cython_func_type("roll_kurt") + kwargs.pop("require_min_periods", None) return self._apply( - "roll_kurt", "kurt", check_minp=_require_min_periods(4), **kwargs + window_func, + center=self.center, + require_min_periods=4, + name="kurt", + **kwargs, ) _shared_docs["quantile"] = dedent( @@ -1475,33 +1521,22 @@ def kurt(self, **kwargs): ) def quantile(self, quantile, interpolation="linear", **kwargs): - window = self._get_window() - index_as_array = self._get_index() - - def f(arg, *args, **kwargs): - minp = _use_window(self.min_periods, window) - if quantile == 1.0: - return libwindow.roll_max( - arg, window, minp, index_as_array, self.closed - ) - elif quantile == 0.0: - return libwindow.roll_min( - arg, window, minp, index_as_array, self.closed - ) - else: - return libwindow.roll_quantile( - arg, - window, - minp, - index_as_array, - self.closed, - quantile, - interpolation, - ) + if quantile == 1.0: + window_func = self._get_cython_func_type("roll_max") + elif quantile == 0.0: + window_func = self._get_cython_func_type("roll_min") + else: + window_func = partial( + self._get_roll_func("roll_quantile"), + win=self._get_window(), + quantile=quantile, + interpolation=interpolation, + ) - return self._apply( - f, "quantile", quantile=quantile, interpolation=interpolation, **kwargs - ) + # Pass through for groupby.rolling + kwargs["quantile"] = quantile + kwargs["interpolation"] = interpolation + return self._apply(window_func, center=self.center, name="quantile", **kwargs) _shared_docs[ "cov" @@ -1856,7 +1891,8 @@ def count(self): # different impl for freq counting if self.is_freq_type: - return self._apply("roll_count", "count") + window_func = self._get_roll_func("roll_count") + return self._apply(window_func, center=self.center, name="count") return super().count() diff --git a/setup.py b/setup.py index 545765ecb114d..0915b6aba113a 100755 --- a/setup.py +++ b/setup.py @@ -344,6 +344,7 @@ class CheckSDist(sdist_class): "pandas/_libs/tslibs/resolution.pyx", "pandas/_libs/tslibs/parsing.pyx", "pandas/_libs/tslibs/tzconversion.pyx", + "pandas/_libs/window_indexer.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", ] @@ -683,6 +684,7 
@@ def srcpath(name=None, suffix=".pyx", subdir="src"): }, "_libs.testing": {"pyxfile": "_libs/testing"}, "_libs.window": {"pyxfile": "_libs/window", "language": "c++", "suffix": ".cpp"}, + "_libs.window_indexer": {"pyxfile": "_libs/window_indexer"}, "_libs.writers": {"pyxfile": "_libs/writers"}, "io.sas._sas": {"pyxfile": "io/sas/sas"}, "io.msgpack._packer": { From 32d541eb126b93851dd38a77e02dfd2cda096b12 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Nov 2019 05:00:32 -0800 Subject: [PATCH 147/185] CLN: io.pytables (#29756) --- pandas/io/pytables.py | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7c447cbf78677..9d2642ae414d0 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1604,10 +1604,11 @@ class TableIterator: """ chunksize: Optional[int] + store: HDFStore def __init__( self, - store, + store: HDFStore, s, func, where, @@ -1616,7 +1617,7 @@ def __init__( stop=None, iterator: bool = False, chunksize=None, - auto_close=False, + auto_close: bool = False, ): self.store = store self.s = s @@ -1772,9 +1773,6 @@ def set_pos(self, pos: int): self.typ._v_pos = pos return self - def set_table(self, table): - self.table = table - def __repr__(self) -> str: temp = tuple( map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) @@ -1800,8 +1798,7 @@ def __ne__(self, other) -> bool: def is_indexed(self) -> bool: """ return whether I am an indexed column """ if not hasattr(self.table, "cols"): - # e.g. if self.set_table hasn't been called yet, self.table - # will be None. + # e.g. if infer hasn't been called yet, self.table will be None. return False # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute # 'error: "None" has no attribute "cols"' @@ -1815,7 +1812,7 @@ def infer(self, handler): """infer this column from the table: create and return a new object""" table = handler.table new_self = self.copy() - new_self.set_table(table) + new_self.table = table new_self.get_attr() new_self.read_metadata(handler) return new_self @@ -1896,7 +1893,7 @@ def validate_names(self): pass def validate_and_set(self, handler: "AppendableTable", append: bool): - self.set_table(handler.table) + self.table = handler.table self.validate_col() self.validate_attr(append) self.validate_metadata(handler) @@ -2941,13 +2938,8 @@ def read_index_node( data = node[start:stop] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. 
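The simplification in the hunk below leans on the fact that getattr with a constant name is plain attribute access, so dropping the getattr calls is behavior-preserving. A minimal sketch with a hypothetical stand-in object (FakeAttrs is illustrative, not a real PyTables node):

    import numpy as np

    class FakeAttrs:
        # stand-in for a PyTables node's _v_attrs
        shape = (0,)
        value_type = "object"

    attrs = FakeAttrs()
    # getattr with a literal attribute name is just attribute access
    assert getattr(attrs, "shape") == attrs.shape
    # rebuilding the original empty array from the stored metadata
    data = np.empty(attrs.shape, dtype=attrs.value_type)
    assert data.shape == (0,)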
- if "shape" in node._v_attrs and self._is_empty_array( - getattr(node._v_attrs, "shape") - ): - data = np.empty( - getattr(node._v_attrs, "shape"), - dtype=getattr(node._v_attrs, "value_type"), - ) + if "shape" in node._v_attrs and self._is_empty_array(node._v_attrs.shape): + data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,) kind = _ensure_decoded(node._v_attrs.kind) name = None @@ -3126,7 +3118,7 @@ class SeriesFixed(GenericFixed): @property def shape(self): try: - return (len(getattr(self.group, "values")),) + return (len(self.group.values),) except (TypeError, AttributeError): return None @@ -3161,7 +3153,7 @@ def shape(self): items += shape[0] # data shape - node = getattr(self.group, "block0_values") + node = self.group.block0_values shape = getattr(node, "shape", None) if shape is not None: shape = list(shape[0 : (ndim - 1)]) @@ -3481,10 +3473,6 @@ def read_metadata(self, key): return self.parent.select(self._get_metadata_path(key)) return None - def set_info(self): - """ update our table index info """ - self.attrs.info = self.info - def set_attrs(self): """ set our table type & indexables """ self.attrs.table_type = str(self.table_type) @@ -3497,7 +3485,7 @@ def set_attrs(self): self.attrs.errors = self.errors self.attrs.levels = self.levels self.attrs.metadata = self.metadata - self.set_info() + self.attrs.info = self.info def get_attrs(self): """ retrieve our attributes """ @@ -4230,7 +4218,7 @@ def write( # table = self.table # update my info - self.set_info() + self.attrs.info = self.info # validate the axes and set the kinds for a in self.axes: @@ -4964,6 +4952,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): def _maybe_convert(values: np.ndarray, val_kind, encoding, errors): + val_kind = _ensure_decoded(val_kind) if _need_convert(val_kind): conv = _get_converter(val_kind, encoding, errors) # conv = np.frompyfunc(conv, 1, 1) @@ -4971,8 +4960,7 @@ def _maybe_convert(values: np.ndarray, val_kind, encoding, errors): return values -def _get_converter(kind, encoding, errors): - kind = _ensure_decoded(kind) +def _get_converter(kind: str, encoding, errors): if kind == "datetime64": return lambda x: np.asarray(x, dtype="M8[ns]") elif kind == "datetime": @@ -4980,11 +4968,10 @@ def _get_converter(kind, encoding, errors): elif kind == "string": return lambda x: _unconvert_string_array(x, encoding=encoding, errors=errors) else: # pragma: no cover - raise ValueError("invalid kind {kind}".format(kind=kind)) + raise ValueError(f"invalid kind {kind}") def _need_convert(kind) -> bool: - kind = _ensure_decoded(kind) if kind in ("datetime", "datetime64", "string"): return True return False From 510bdddbb8434ba50b5c2d388db3094bb5cba0ed Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Nov 2019 05:02:30 -0800 Subject: [PATCH 148/185] TYP: annotate queryables (#29754) --- pandas/core/computation/ops.py | 4 +++- pandas/core/computation/pytables.py | 35 ++++++++++++++++++++++------- pandas/core/computation/scope.py | 5 +++-- pandas/io/pytables.py | 3 ++- 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 524013ceef5ff..41d7f96f5e96d 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -55,7 +55,7 @@ class UndefinedVariableError(NameError): NameError subclass for local variables. 
""" - def __init__(self, name, is_local): + def __init__(self, name, is_local: bool): if is_local: msg = "local variable {0!r} is not defined" else: @@ -69,6 +69,8 @@ def __new__(cls, name, env, side=None, encoding=None): supr_new = super(Term, klass).__new__ return supr_new(klass) + is_local: bool + def __init__(self, name, env, side=None, encoding=None): # name is a str for Term, but may be something else for subclasses self._name = name diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index ff7e713b3e71a..8dee273517f88 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -2,7 +2,7 @@ import ast from functools import partial -from typing import Any, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import numpy as np @@ -24,17 +24,27 @@ class Scope(_scope.Scope): __slots__ = ("queryables",) - def __init__(self, level: int, global_dict=None, local_dict=None, queryables=None): + queryables: Dict[str, Any] + + def __init__( + self, + level: int, + global_dict=None, + local_dict=None, + queryables: Optional[Dict[str, Any]] = None, + ): super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) self.queryables = queryables or dict() class Term(ops.Term): + env: Scope + def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls return object.__new__(klass) - def __init__(self, name, env, side=None, encoding=None): + def __init__(self, name, env: Scope, side=None, encoding=None): super().__init__(name, env, side=side, encoding=encoding) def _resolve_name(self): @@ -69,7 +79,10 @@ class BinOp(ops.BinOp): _max_selectors = 31 - def __init__(self, op, lhs, rhs, queryables, encoding): + op: str + queryables: Dict[str, Any] + + def __init__(self, op: str, lhs, rhs, queryables: Dict[str, Any], encoding): super().__init__(op, lhs, rhs) self.queryables = queryables self.encoding = encoding @@ -373,9 +386,6 @@ def prune(self, klass): return None -_op_classes = {"unary": UnaryOp} - - class ExprVisitor(BaseExprVisitor): const_type = Constant term_type = Term @@ -510,7 +520,16 @@ class Expr(expr.Expr): "major_axis>=20130101" """ - def __init__(self, where, queryables=None, encoding=None, scope_level: int = 0): + _visitor: Optional[ExprVisitor] + env: Scope + + def __init__( + self, + where, + queryables: Optional[Dict[str, Any]] = None, + encoding=None, + scope_level: int = 0, + ): where = _validate_where(where) diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 2c5c687a44680..71aa885816670 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -9,6 +9,7 @@ import pprint import struct import sys +from typing import List import numpy as np @@ -203,7 +204,7 @@ def resolve(self, key, is_local): raise UndefinedVariableError(key, is_local) - def swapkey(self, old_key, new_key, new_value=None): + def swapkey(self, old_key: str, new_key: str, new_value=None): """ Replace a variable name, with a potentially new value. @@ -228,7 +229,7 @@ def swapkey(self, old_key, new_key, new_value=None): mapping[new_key] = new_value return - def _get_vars(self, stack, scopes): + def _get_vars(self, stack, scopes: List[str]): """ Get specifically scoped variables from a list of stack frames. 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9d2642ae414d0..b2d742fe05141 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2040,6 +2040,7 @@ def convert( Table row number: the end of the sub-selection. Values larger than the underlying table's row count are normalized to that. """ + assert self.table is not None # for mypy assert self.table is not None @@ -3417,7 +3418,7 @@ def data_orientation(self): ) ) - def queryables(self): + def queryables(self) -> Dict[str, Any]: """ return a dict of the kinds allowable columns for this object """ # compute the values_axes queryables From 56984626182ffbe16afb7c291c912470938d65c5 Mon Sep 17 00:00:00 2001 From: Lucas Scarlato Astur Date: Thu, 21 Nov 2019 10:04:13 -0300 Subject: [PATCH 149/185] format replaced with f-strings (#29701) --- pandas/core/groupby/generic.py | 11 +++++----- pandas/core/groupby/groupby.py | 8 +++---- pandas/core/groupby/grouper.py | 39 +++++++++++----------------------- pandas/core/groupby/ops.py | 12 ++++------- 4 files changed, 24 insertions(+), 46 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7d3bf3d3dcd2f..b62f2888949b9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -301,8 +301,7 @@ def _aggregate_multiple_funcs(self, arg): obj = self if name in results: raise SpecificationError( - "Function names must be unique, found multiple named " - "{name}".format(name=name) + f"Function names must be unique, found multiple named {name}" ) # reset the cache so that we @@ -588,7 +587,7 @@ def nunique(self, dropna: bool = True) -> Series: try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes - msg = "val.dtype must be object, got {}".format(val.dtype) + msg = f"val.dtype must be object, got {val.dtype}" assert val.dtype == object, msg val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) @@ -1550,8 +1549,8 @@ def filter(self, func, dropna=True, *args, **kwargs): else: # non scalars aren't allowed raise TypeError( - "filter function returned a {typ}, " - "but expected a scalar bool".format(typ=type(res).__name__) + f"filter function returned a {type(res).__name__}, " + "but expected a scalar bool" ) return self._apply_filter(indices, dropna) @@ -1950,7 +1949,7 @@ def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: for aggfunc in aggfuncs: if com.get_callable_name(aggfunc) == "": aggfunc = partial(aggfunc) - aggfunc.__name__ = "".format(i) + aggfunc.__name__ = f"" i += 1 mangled_aggfuncs.append(aggfunc) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f7282950498c5..21c085c775399 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -566,9 +566,7 @@ def __getattr__(self, attr): return self[attr] raise AttributeError( - "'{typ}' object has no attribute '{attr}'".format( - typ=type(self).__name__, attr=attr - ) + f"'{type(self).__name__}' object has no attribute '{attr}'" ) @Substitution( @@ -1783,7 +1781,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra raise ValueError( "For a DataFrame groupby, dropna must be " "either None, 'any' or 'all', " - "(was passed {dropna}).".format(dropna=dropna) + f"(was passed {dropna})." ) # old behaviour, but with all and any support for DataFrames. 
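These f-string conversions are mechanical and behavior-preserving (pandas already requires Python 3.6+, where f-strings are available). For instance, for the message in the preceding hunk:

    dropna = "any"
    old = (
        "For a DataFrame groupby, dropna must be "
        "either None, 'any' or 'all', "
        "(was passed {dropna}).".format(dropna=dropna)
    )
    new = (
        "For a DataFrame groupby, dropna must be "
        "either None, 'any' or 'all', "
        f"(was passed {dropna})."
    )
    assert old == new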
@@ -2531,7 +2529,7 @@ def get_groupby( klass = DataFrameGroupBy else: - raise TypeError("invalid type: {obj}".format(obj=obj)) + raise TypeError(f"invalid type: {obj}") return klass( obj=obj, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c37617b1f1f7f..2b946d1ff0a7a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -173,9 +173,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: - raise KeyError( - "The grouper name {key} is not found".format(key=key) - ) + raise KeyError(f"The grouper name {key} is not found") ax = Index(obj[key], name=key) else: @@ -191,9 +189,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): else: if level not in (0, ax.name): - raise ValueError( - "The level {level} is not valid".format(level=level) - ) + raise ValueError(f"The level {level} is not valid") # possibly sort if (self.sort or sort) and not ax.is_monotonic: @@ -212,13 +208,13 @@ def groups(self): def __repr__(self) -> str: attrs_list = ( - "{name}={val!r}".format(name=attr_name, val=getattr(self, attr_name)) + f"{attr_name}={getattr(self, attr_name)!r}" for attr_name in self._attributes if getattr(self, attr_name) is not None ) attrs = ", ".join(attrs_list) cls_name = self.__class__.__name__ - return "{cls}({attrs})".format(cls=cls_name, attrs=attrs) + return f"{cls_name}({attrs})" class Grouping: @@ -280,9 +276,7 @@ def __init__( if level is not None: if not isinstance(level, int): if level not in index.names: - raise AssertionError( - "Level {level} not in index".format(level=level) - ) + raise AssertionError(f"Level {level} not in index") level = index.names.index(level) if self.name is None: @@ -350,17 +344,16 @@ def __init__( ): if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) - raise ValueError("Grouper for '{t}' not 1-dimensional".format(t=t)) + raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) if not ( hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index) ): + grper = pprint_thing(self.grouper) errmsg = ( "Grouper result violates len(labels) == " - "len(data)\nresult: {grper}".format( - grper=pprint_thing(self.grouper) - ) + f"len(data)\nresult: {grper}" ) self.grouper = None # Try for sanity raise AssertionError(errmsg) @@ -375,7 +368,7 @@ def __init__( self.grouper = self.grouper.astype("timedelta64[ns]") def __repr__(self) -> str: - return "Grouping({name})".format(name=self.name) + return f"Grouping({self.name})" def __iter__(self): return iter(self.indices) @@ -500,11 +493,7 @@ def get_grouper( if isinstance(level, str): if obj.index.name != level: - raise ValueError( - "level name {level} is not the name of the index".format( - level=level - ) - ) + raise ValueError(f"level name {level} is not the name of the index") elif level > 0 or level < -1: raise ValueError("level > 0 or level < -1 only valid with MultiIndex") @@ -636,12 +625,8 @@ def is_in_obj(gpr) -> bool: if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( - ( - "Length of grouper ({len_gpr}) and axis ({len_axis})" - " must be same length".format( - len_gpr=len(gpr), len_axis=obj.shape[axis] - ) - ) + f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]})" + " must be same length" ) # create the Grouping diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7fd9fb8f53134..a7e0a901a5394 100644 --- 
a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -452,18 +452,16 @@ def _cython_operation( # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): - raise NotImplementedError( - "{dtype} dtype not supported".format(dtype=values.dtype) - ) + raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( - "datetime64 type does not support {how} operations".format(how=how) + f"datetime64 type does not support {how} operations" ) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( - "timedelta64 type does not support {how} operations".format(how=how) + f"timedelta64 type does not support {how} operations" ) if is_datetime64tz_dtype(values.dtype): @@ -516,9 +514,7 @@ def _cython_operation( out_dtype = "float" else: if is_numeric: - out_dtype = "{kind}{itemsize}".format( - kind=values.dtype.kind, itemsize=values.dtype.itemsize - ) + out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" From b4431805f325e956af238ee8bbeb0446486dabbd Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 21 Nov 2019 13:06:07 +0000 Subject: [PATCH 150/185] DEPS: Unifying testing and building dependencies across builds (#29678) --- ci/deps/azure-36-32bit.yaml | 22 +++++++++++++--------- ci/deps/azure-36-locale.yaml | 19 +++++++++++-------- ci/deps/azure-36-locale_slow.yaml | 21 +++++++++++---------- ci/deps/azure-36-minimum_versions.yaml | 19 +++++++++++-------- ci/deps/azure-37-locale.yaml | 21 +++++++++++---------- ci/deps/azure-37-numpydev.yaml | 17 +++++++++-------- ci/deps/azure-macos-36.yaml | 21 +++++++++++---------- ci/deps/azure-windows-36.yaml | 19 +++++++++++-------- ci/deps/azure-windows-37.yaml | 19 +++++++++++-------- ci/deps/travis-36-cov.yaml | 18 +++++++++++------- ci/deps/travis-36-locale.yaml | 19 ++++++++++--------- ci/deps/travis-36-slow.yaml | 17 ++++++++++------- ci/deps/travis-37.yaml | 17 ++++++++++------- ci/deps/travis-38.yaml | 11 +++++++---- environment.yml | 2 +- requirements-dev.txt | 2 +- 16 files changed, 149 insertions(+), 115 deletions(-) diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index 1e2e6c33e8c15..f3e3d577a7a33 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml @@ -3,21 +3,25 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + ### Cython 0.29.13 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - attrs=19.1.0 - gcc_linux-32 - - gcc_linux-32 - gxx_linux-32 - numpy=1.14.* - python-dateutil - - python=3.6.* - pytz=2017.2 - # universal - - pytest - - pytest-xdist - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 + + # see comment above - pip - pip: - # Anaconda doesn't build a new enough Cython - cython>=0.29.13 + - pytest>=5.0.1 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 76868f598f11b..3baf975afc096 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -3,28 +3,31 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - 
beautifulsoup4==4.6.0 - bottleneck=1.2.* - - cython=0.29.13 - lxml - matplotlib=2.2.2 - numpy=1.14.* - openpyxl=2.4.8 - python-dateutil - python-blosc - - python=3.6.* - pytz=2017.2 - scipy - sqlalchemy=1.1.4 - xlrd=1.1.0 - xlsxwriter=0.9.8 - xlwt=1.2.0 - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 - pip - pip: - html5lib==1.0b2 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 21205375204dc..01741e9b65a7a 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -3,8 +3,18 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.6.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4 - gcsfs - html5lib - ipython @@ -17,7 +27,6 @@ dependencies: - openpyxl - pytables - python-dateutil - - python=3.6.* - pytz - s3fs - scipy @@ -25,12 +34,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=4.0.2 - - pytest-xdist - - pytest-mock - - pytest-azurepipelines - moto - - pip - - pip: - - hypothesis>=3.58.0 diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index e2c78165fe4b9..1e32ef7482be3 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -3,25 +3,28 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.1 + + # tools + - cython=0.29.13 + - pytest=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - - cython>=0.29.13 - jinja2=2.8 - numexpr=2.6.2 - numpy=1.13.3 - openpyxl=2.4.8 - pytables=3.4.2 - python-dateutil=2.6.1 - - python=3.6.1 - pytz=2017.2 - scipy=0.19.0 - xlrd=1.1.0 - xlsxwriter=0.9.8 - xlwt=1.2.0 - # universal - html5lib=1.0.1 - - hypothesis>=3.58.0 - - pytest=4.5.0 - - pytest-xdist - - pytest-mock - - pytest-azurepipelines diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 24464adb74f5b..26446ab5365b1 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -3,8 +3,18 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.7.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4 - html5lib - ipython - jinja2 @@ -17,7 +27,6 @@ dependencies: - openpyxl - pytables - python-dateutil - - python=3.7.* - pytz - s3fs - scipy @@ -25,11 +34,3 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - pip - - pip: - - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 0fb06fd43724c..3264df5944e35 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -3,14 +3,17 @@ channels: - defaults dependencies: - python=3.7.* - - pytz - - Cython>=0.29.13 - # universal - # pytest < 5 until defaults has pytest-xdist>=1.29.0 - - pytest>=4.0.2,<5.0 - - pytest-xdist + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 - pytest-mock - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - pytz - pip - pip: - "git+git://github.com/dateutil/dateutil.git" @@ -18,5 +21,3 @@ 
dependencies: - "--pre" - "numpy" - "scipy" - # https://github.com/pandas-dev/pandas/issues/27421 - - pytest-azurepipelines<1.0.0 diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 85c090bf6f938..48ba87d26f53d 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -2,6 +2,17 @@ name: pandas-dev channels: - defaults dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - beautifulsoup4 - bottleneck - html5lib @@ -14,7 +25,6 @@ dependencies: - openpyxl - pyarrow - pytables - - python=3.6.* - python-dateutil==2.6.1 - pytz - xarray @@ -23,13 +33,4 @@ dependencies: - xlwt - pip - pip: - # Anaconda / conda-forge don't build for 3.5 - - cython>=0.29.13 - pyreadstat - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - hypothesis>=3.58.0 - # https://github.com/pandas-dev/pandas/issues/27421 - - pytest-azurepipelines<1.0.0 diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 88b38aaef237c..e3ad1d8371623 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -3,6 +3,17 @@ channels: - conda-forge - defaults dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - blosc - bottleneck - fastparquet>=0.2.1 @@ -13,16 +24,8 @@ dependencies: - pyarrow - pytables - python-dateutil - - python=3.6.* - pytz - scipy - xlrd - xlsxwriter - xlwt - # universal - - cython>=0.29.13 - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 7680ed9fd9c92..07e134b054c10 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -3,6 +3,17 @@ channels: - defaults - conda-forge dependencies: + - python=3.7.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - beautifulsoup4 - bottleneck - gcsfs @@ -15,7 +26,6 @@ dependencies: - numpy=1.14.* - openpyxl - pytables - - python=3.7.* - python-dateutil - pytz - s3fs @@ -24,11 +34,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - cython>=0.29.13 - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 - pyreadstat diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index b2a74fceaf0fa..9148e0d4b29d9 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -3,6 +3,17 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + - pytest-cov # this is only needed in the coverage build + + # pandas dependencies - beautifulsoup4 - botocore>=1.11 - cython>=0.29.13 @@ -27,7 +38,6 @@ dependencies: - pymysql - pytables - python-snappy - - python=3.6.* - pytz - s3fs - scikit-learn @@ -38,12 +48,6 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-cov - - pytest-mock - - hypothesis>=3.58.0 - pip - pip: - brotlipy diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 09f72e65098c9..3199ee037bc0a 100644 --- 
a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -3,10 +3,19 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + + # pandas dependencies - beautifulsoup4 - blosc=1.14.3 - python-blosc - - cython>=0.29.13 - fastparquet=0.2.1 - gcsfs=0.2.2 - html5lib @@ -24,7 +33,6 @@ dependencies: - pymysql=0.7.11 - pytables - python-dateutil - - python=3.6.* - pytz - s3fs=0.3.0 - scipy @@ -33,10 +41,3 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pip - - pip: - - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index e9c5dadbc924a..eab374c96772c 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -3,8 +3,17 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.6.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + + # pandas dependencies + - beautifulsoup4 - html5lib - lxml - matplotlib @@ -16,7 +25,6 @@ dependencies: - pymysql - pytables - python-dateutil - - python=3.6.* - pytz - s3fs - scipy @@ -24,9 +32,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - moto - - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 903636f2fe060..7b75a427a4954 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -5,20 +5,23 @@ channels: - c3i_test dependencies: - python=3.7.* - - botocore>=1.11 + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-mock + - hypothesis>=3.58.0 + + # pandas dependencies + - botocore>=1.11 - numpy - python-dateutil - nomkl - pyarrow - pytz - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - hypothesis>=3.58.0 - s3fs - - pip - pyreadstat + - pip - pip: - moto diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml index bd62ffa9248fe..88da1331b463a 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -4,13 +4,16 @@ channels: - conda-forge dependencies: - python=3.8.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 # The rest of the builds use >=1.21, and use pytest-mock + - hypothesis>=3.58.0 + + # pandas dependencies - numpy - python-dateutil - nomkl - pytz - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - hypothesis>=3.58.0 - pip diff --git a/environment.yml b/environment.yml index ef5767f26dceb..54c99f415165d 100644 --- a/environment.yml +++ b/environment.yml @@ -51,7 +51,7 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 - - pytest>=4.0.2 + - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 - seaborn diff --git a/requirements-dev.txt b/requirements-dev.txt index 3ae5b57de5d02..87b348c39a17b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -30,7 +30,7 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto -pytest>=4.0.2 +pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 seaborn From e29a341d87d122fc2f8e349a1837be197830557e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Nov 2019 05:07:57 -0800 Subject: [PATCH 151/185] REF: dont _try_cast for user-defined functions (#29698) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/apply.py | 2 ++ pandas/core/groupby/generic.py | 6 ++++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git 
a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3b87150f544cf..db24be628dd67 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -511,6 +511,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) +- Reshaping ^^^^^^^^^ diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 071cd116ea982..34a8ed1fa7a83 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -226,6 +226,8 @@ def apply_raw(self): if "Function does not reduce" not in str(err): # catch only ValueError raised intentionally in libreduction raise + # We expect np.apply_along_axis to give a two-dimensional result, or + # also raise. result = np.apply_along_axis(self.f, self.axis, self.values) # TODO: mixed type case diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b62f2888949b9..900e11dedb8b1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1109,12 +1109,12 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) + result[name] = fres else: for name in self.indices: data = self.get_group(name, obj=obj) fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) + result[name] = fres return self._wrap_frame_output(result, obj) @@ -1424,6 +1424,8 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: output = [] for i, _ in enumerate(result.columns): res = algorithms.take_1d(result.iloc[:, i].values, ids) + # TODO: we have no test cases that get here with EA dtypes; + # try_cast may not be needed if EAs never get here if cast: res = self._try_cast(res, obj.iloc[:, i]) output.append(res) From 4090d43b8f7860e48a2f7c6e7f90b0f98aca3118 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Nov 2019 05:24:18 -0800 Subject: [PATCH 152/185] DEPR: remove is_period, is_datetimetz (#29744) --- .../reference/general_utility_functions.rst | 2 - doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/dtypes/api.py | 2 - pandas/core/dtypes/common.py | 87 ------------------- pandas/tests/api/test_types.py | 2 +- pandas/tests/dtypes/test_common.py | 19 ---- pandas/tests/dtypes/test_dtypes.py | 20 ----- 7 files changed, 2 insertions(+), 131 deletions(-) diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 9c69770c0f1b7..0961acc43f301 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -97,13 +97,11 @@ Scalar introspection api.types.is_bool api.types.is_categorical api.types.is_complex - api.types.is_datetimetz api.types.is_float api.types.is_hashable api.types.is_integer api.types.is_interval api.types.is_number - api.types.is_period api.types.is_re api.types.is_re_compilable api.types.is_scalar diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index db24be628dd67..d110b28754d3a 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -325,6 +325,7 @@ or 
``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - :meth:`DataFrame.to_records` no longer supports the argument "convert_datetime64" (:issue:`18902`) - Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - Changed the default value for the "keep_tz" argument in :meth:`DatetimeIndex.to_series` to ``True`` (:issue:`23739`) +- Removed the previously deprecated :func:`api.types.is_period` and :func:`api.types.is_datetimetz` (:issue:`23917`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) - Removed previously deprecated :func:`pandas.tseries.plotting.tsplot` (:issue:`18627`) - Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`) diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index 2b527e1fb5890..cb0912cbcf880 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -12,7 +12,6 @@ is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetimetz, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -32,7 +31,6 @@ is_number, is_numeric_dtype, is_object_dtype, - is_period, is_period_dtype, is_re, is_re_compilable, diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index dcc8a274492ee..783669688ea42 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -375,56 +375,6 @@ def is_categorical(arr) -> bool: return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) -def is_datetimetz(arr) -> bool: - """ - Check whether an array-like is a datetime array-like with a timezone - component in its dtype. - - .. deprecated:: 0.24.0 - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a datetime array-like with a - timezone component in its dtype. - - Examples - -------- - >>> is_datetimetz([1, 2, 3]) - False - - Although the following examples are both DatetimeIndex objects, - the first one returns False because it has no timezone component - unlike the second one, which returns True. - - >>> is_datetimetz(pd.DatetimeIndex([1, 2, 3])) - False - >>> is_datetimetz(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) - True - - The object need not be a DatetimeIndex object. It just needs to have - a dtype which has a timezone component. - - >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern") - >>> s = pd.Series([], dtype=dtype) - >>> is_datetimetz(s) - True - """ - - warnings.warn( - "'is_datetimetz' is deprecated and will be removed in a " - "future version. Use 'is_datetime64tz_dtype' instead.", - FutureWarning, - stacklevel=2, - ) - return is_datetime64tz_dtype(arr) - - def is_offsetlike(arr_or_obj) -> bool: """ Check if obj or all elements of list-like is DateOffset @@ -456,43 +406,6 @@ def is_offsetlike(arr_or_obj) -> bool: return False -def is_period(arr) -> bool: - """ - Check whether an array-like is a periodical index. - - .. deprecated:: 0.24.0 - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a periodical index. 
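Alongside the docstring examples being deleted in this hunk, the replacement spelling is the dtype-based check, which this same patch keeps:

    import pandas as pd
    from pandas.api.types import is_period_dtype

    assert is_period_dtype(pd.PeriodIndex(["2017-01-01"], freq="D"))
    assert not is_period_dtype(pd.Index([1, 2, 3]))
    assert not is_period_dtype(1.0)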
- - Examples - -------- - >>> is_period([1, 2, 3]) - False - >>> is_period(pd.Index([1, 2, 3])) - False - >>> is_period(pd.PeriodIndex(["2017-01-01"], freq="D")) - True - """ - - warnings.warn( - "'is_period' is deprecated and will be removed in a future " - "version. Use 'is_period_dtype' or is_period_arraylike' " - "instead.", - FutureWarning, - stacklevel=2, - ) - - return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr) - - def is_datetime64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the datetime64 dtype. diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index e9f68692a9863..97480502f192c 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -50,7 +50,7 @@ class TestTypes(Base): "infer_dtype", "is_extension_array_dtype", ] - deprecated = ["is_period", "is_datetimetz", "is_extension_type"] + deprecated = ["is_extension_type"] dtypes = ["CategoricalDtype", "DatetimeTZDtype", "PeriodDtype", "IntervalDtype"] def test_types(self): diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index d8420673104d5..ae625ed8e389f 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -207,25 +207,6 @@ def test_is_categorical(): assert not com.is_categorical([1, 2, 3]) -def test_is_datetimetz(): - with tm.assert_produces_warning(FutureWarning): - assert not com.is_datetimetz([1, 2, 3]) - assert not com.is_datetimetz(pd.DatetimeIndex([1, 2, 3])) - - assert com.is_datetimetz(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) - - dtype = DatetimeTZDtype("ns", tz="US/Eastern") - s = pd.Series([], dtype=dtype) - assert com.is_datetimetz(s) - - -def test_is_period_deprecated(): - with tm.assert_produces_warning(FutureWarning): - assert not com.is_period([1, 2, 3]) - assert not com.is_period(pd.Index([1, 2, 3])) - assert com.is_period(pd.PeriodIndex(["2017-01-01"], freq="D")) - - def test_is_datetime64_dtype(): assert not com.is_datetime64_dtype(object) assert not com.is_datetime64_dtype([1, 2, 3]) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index f4bf4c1fc83d9..fc896e6a9d348 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -12,10 +12,8 @@ is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetimetz, is_dtype_equal, is_interval_dtype, - is_period, is_period_dtype, is_string_dtype, ) @@ -294,25 +292,15 @@ def test_basic(self): assert not is_datetime64tz_dtype(np.dtype("float64")) assert not is_datetime64tz_dtype(1.0) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s) - assert is_datetimetz(s.dtype) - assert not is_datetimetz(np.dtype("float64")) - assert not is_datetimetz(1.0) - def test_dst(self): dr1 = date_range("2013-01-01", periods=3, tz="US/Eastern") s1 = Series(dr1, name="A") assert is_datetime64tz_dtype(s1) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s1) dr2 = date_range("2013-08-01", periods=3, tz="US/Eastern") s2 = Series(dr2, name="A") assert is_datetime64tz_dtype(s2) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s2) assert s1.dtype == s2.dtype @pytest.mark.parametrize("tz", ["UTC", "US/Eastern"]) @@ -457,22 +445,14 @@ def test_basic(self): assert is_period_dtype(pidx.dtype) assert is_period_dtype(pidx) - with tm.assert_produces_warning(FutureWarning): - assert is_period(pidx) s = Series(pidx, name="A") assert is_period_dtype(s.dtype) assert 
is_period_dtype(s) - with tm.assert_produces_warning(FutureWarning): - assert is_period(s) assert not is_period_dtype(np.dtype("float64")) assert not is_period_dtype(1.0) - with tm.assert_produces_warning(FutureWarning): - assert not is_period(np.dtype("float64")) - with tm.assert_produces_warning(FutureWarning): - assert not is_period(1.0) def test_empty(self): dt = PeriodDtype() From f4d1a84370fdc0d4a9a3f3597da58f30762b8d24 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Nov 2019 05:29:11 -0800 Subject: [PATCH 153/185] DEPR: remove Series.valid, is_copy, get_ftype_counts, Index.get_duplicate, Series.clip_upper, clip_lower (#29724) --- doc/source/reference/frame.rst | 4 - doc/source/reference/series.rst | 3 - doc/source/whatsnew/v1.0.0.rst | 5 + pandas/core/generic.py | 279 +----------------- pandas/core/internals/managers.py | 7 - pandas/core/series.py | 20 -- pandas/tests/frame/test_analytics.py | 21 +- pandas/tests/generic/test_series.py | 5 - .../indexing/test_chaining_and_caching.py | 11 - pandas/tests/series/test_analytics.py | 17 +- pandas/tests/series/test_dtypes.py | 5 - 11 files changed, 9 insertions(+), 368 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 37d27093efefd..4540504974f56 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -30,7 +30,6 @@ Attributes and underlying data DataFrame.dtypes DataFrame.ftypes DataFrame.get_dtype_counts - DataFrame.get_ftype_counts DataFrame.select_dtypes DataFrame.values DataFrame.get_values @@ -40,7 +39,6 @@ Attributes and underlying data DataFrame.shape DataFrame.memory_usage DataFrame.empty - DataFrame.is_copy Conversion ~~~~~~~~~~ @@ -142,8 +140,6 @@ Computations / descriptive stats DataFrame.all DataFrame.any DataFrame.clip - DataFrame.clip_lower - DataFrame.clip_upper DataFrame.compound DataFrame.corr DataFrame.corrwith diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 59910ba357130..c501e8bc91379 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -45,7 +45,6 @@ Attributes Series.dtypes Series.ftypes Series.data - Series.is_copy Series.name Series.put @@ -148,8 +147,6 @@ Computations / descriptive stats Series.autocorr Series.between Series.clip - Series.clip_lower - Series.clip_upper Series.corr Series.count Series.cov diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index d110b28754d3a..70c30eb42169b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -333,6 +333,11 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. 
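For reference, sketches of the supported replacements for the removals listed in the entries below (the duplicates idiom comes straight from the deprecation note):

    import pandas as pd

    s = pd.Series([1.0, None, 3.0])
    s.dropna()                      # replaces the removed Series.valid()

    idx = pd.Index([1, 1, 2, 3, 3])
    idx[idx.duplicated()].unique()  # replaces the removed Index.get_duplicates()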
- - Removed previously deprecated "nthreads" argument from :func:`read_feather`, use "use_threads" instead (:issue:`23053`) - Removed :meth:`Index.is_lexsorted_for_tuple` (:issue:`29305`) - Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`29608`) +- Removed the previously deprecated :meth:`Series.valid`; use :meth:`Series.dropna` instead (:issue:`18800`) +- Removed the previously deprecated properties :attr:`DataFrame.is_copy`, :attr:`Series.is_copy` (:issue:`18812`) +- Removed the previously deprecated :meth:`DataFrame.get_ftype_counts`, :meth:`Series.get_ftype_counts` (:issue:`18243`) +- Removed the previously deprecated :meth:`Index.get_duplicates`, use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`) +- Removed the previously deprecated :meth:`Series.clip_upper`, :meth:`Series.clip_lower`, :meth:`DataFrame.clip_upper`, :meth:`DataFrame.clip_lower` (:issue:`24203`) - Removed previously deprecated "order" argument from :func:`factorize` (:issue:`19751`) - Removed previously deprecated "v" argument from :meth:`FrozenNDarray.searchsorted`, use "value" instead (:issue:`22672`) - :func:`read_stata` and :meth:`DataFrame.to_stata` no longer support the "encoding" argument (:issue:`21400`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6fbe95fa973cb..7f83bb9e69f7a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -172,16 +172,7 @@ class NDFrame(PandasObject, SelectionMixin): _internal_names_set = set(_internal_names) # type: Set[str] _accessors = set() # type: Set[str] _deprecations = frozenset( - [ - "clip_lower", - "clip_upper", - "get_dtype_counts", - "get_ftype_counts", - "get_values", - "is_copy", - "ftypes", - "ix", - ] + ["get_dtype_counts", "get_values", "ftypes", "ix"] ) # type: FrozenSet[str] _metadata = [] # type: List[str] _is_copy = None @@ -252,29 +243,6 @@ def attrs(self) -> Dict[Optional[Hashable], Any]: def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) - @property - def is_copy(self): - """ - Return the copy. - """ - warnings.warn( - "Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", - FutureWarning, - stacklevel=2, - ) - return self._is_copy - - @is_copy.setter - def is_copy(self, msg): - warnings.warn( - "Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", - FutureWarning, - stacklevel=2, - ) - self._is_copy = msg - def _validate_dtype(self, dtype): """ validate the passed dtype """ @@ -5595,49 +5563,6 @@ def get_dtype_counts(self): return Series(self._data.get_dtype_counts()) - def get_ftype_counts(self): - """ - Return counts of unique ftypes in this object. - - .. deprecated:: 0.23.0 - - Returns - ------- - dtype : Series - Series with the count of columns with each type and - sparsity (dense/sparse). - - See Also - -------- - ftypes : Return ftypes (indication of sparse/dense and dtype) in - this object.
- - Examples - -------- - >>> a = [['a', 1, 1.0], ['b', 2, 2.0], ['c', 3, 3.0]] - >>> df = pd.DataFrame(a, columns=['str', 'int', 'float']) - >>> df - str int float - 0 a 1 1.0 - 1 b 2 2.0 - 2 c 3 3.0 - - >>> df.get_ftype_counts() # doctest: +SKIP - float64:dense 1 - int64:dense 1 - object:dense 1 - dtype: int64 - """ - warnings.warn( - "get_ftype_counts is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - - from pandas import Series - - return Series(self._data.get_ftype_counts()) - @property def dtypes(self): """ @@ -7526,208 +7451,6 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs return result - def clip_upper(self, threshold, axis=None, inplace=False): - """ - Trim values above a given threshold. - - .. deprecated:: 0.24.0 - Use clip(upper=threshold) instead. - - Elements above the `threshold` will be changed to match the - `threshold` value(s). Threshold can be a single value or an array, - in the latter case it performs the truncation element-wise. - - Parameters - ---------- - threshold : numeric or array-like - Maximum value allowed. All values above threshold will be set to - this value. - - * float : every value is compared to `threshold`. - * array-like : The shape of `threshold` should match the object - it's compared to. When `self` is a Series, `threshold` should be - the length. When `self` is a DataFrame, `threshold` should 2-D - and the same shape as `self` for ``axis=None``, or 1-D and the - same length as the axis being compared. - - axis : {0 or 'index', 1 or 'columns'}, default 0 - Align object with `threshold` along the given axis. - inplace : bool, default False - Whether to perform the operation in place on the data. - - .. versionadded:: 0.21.0 - - Returns - ------- - Series or DataFrame - Original data with values trimmed. - - See Also - -------- - Series.clip : General purpose method to trim Series values to given - threshold(s). - DataFrame.clip : General purpose method to trim DataFrame values to - given threshold(s). - - Examples - -------- - >>> s = pd.Series([1, 2, 3, 4, 5]) - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - dtype: int64 - - >>> s.clip(upper=3) - 0 1 - 1 2 - 2 3 - 3 3 - 4 3 - dtype: int64 - - >>> elemwise_thresholds = [5, 4, 3, 2, 1] - >>> elemwise_thresholds - [5, 4, 3, 2, 1] - - >>> s.clip(upper=elemwise_thresholds) - 0 1 - 1 2 - 2 3 - 3 2 - 4 1 - dtype: int64 - """ - warnings.warn( - "clip_upper(threshold) is deprecated, use clip(upper=threshold) instead", - FutureWarning, - stacklevel=2, - ) - return self._clip_with_one_bound( - threshold, method=self.le, axis=axis, inplace=inplace - ) - - def clip_lower(self, threshold, axis=None, inplace=False): - """ - Trim values below a given threshold. - - .. deprecated:: 0.24.0 - Use clip(lower=threshold) instead. - - Elements below the `threshold` will be changed to match the - `threshold` value(s). Threshold can be a single value or an array, - in the latter case it performs the truncation element-wise. - - Parameters - ---------- - threshold : numeric or array-like - Minimum value allowed. All values below threshold will be set to - this value. - - * float : every value is compared to `threshold`. - * array-like : The shape of `threshold` should match the object - it's compared to. When `self` is a Series, `threshold` should be - the length. When `self` is a DataFrame, `threshold` should 2-D - and the same shape as `self` for ``axis=None``, or 1-D and the - same length as the axis being compared. 
- - axis : {0 or 'index', 1 or 'columns'}, default 0 - Align `self` with `threshold` along the given axis. - - inplace : bool, default False - Whether to perform the operation in place on the data. - - .. versionadded:: 0.21.0 - - Returns - ------- - Series or DataFrame - Original data with values trimmed. - - See Also - -------- - Series.clip : General purpose method to trim Series values to given - threshold(s). - DataFrame.clip : General purpose method to trim DataFrame values to - given threshold(s). - - Examples - -------- - - Series single threshold clipping: - - >>> s = pd.Series([5, 6, 7, 8, 9]) - >>> s.clip(lower=8) - 0 8 - 1 8 - 2 8 - 3 8 - 4 9 - dtype: int64 - - Series clipping element-wise using an array of thresholds. `threshold` - should be the same length as the Series. - - >>> elemwise_thresholds = [4, 8, 7, 2, 5] - >>> s.clip(lower=elemwise_thresholds) - 0 5 - 1 8 - 2 7 - 3 8 - 4 9 - dtype: int64 - - DataFrames can be compared to a scalar. - - >>> df = pd.DataFrame({"A": [1, 3, 5], "B": [2, 4, 6]}) - >>> df - A B - 0 1 2 - 1 3 4 - 2 5 6 - - >>> df.clip(lower=3) - A B - 0 3 3 - 1 3 4 - 2 5 6 - - Or to an array of values. By default, `threshold` should be the same - shape as the DataFrame. - - >>> df.clip(lower=np.array([[3, 4], [2, 2], [6, 2]])) - A B - 0 3 4 - 1 3 4 - 2 6 6 - - Control how `threshold` is broadcast with `axis`. In this case - `threshold` should be the same length as the axis specified by - `axis`. - - >>> df.clip(lower=[3, 3, 5], axis='index') - A B - 0 3 3 - 1 3 4 - 2 5 6 - - >>> df.clip(lower=[4, 5], axis='columns') - A B - 0 4 5 - 1 4 5 - 2 5 6 - """ - warnings.warn( - "clip_lower(threshold) is deprecated, use clip(lower=threshold) instead", - FutureWarning, - stacklevel=2, - ) - return self._clip_with_one_bound( - threshold, method=self.ge, axis=axis, inplace=inplace - ) - def groupby( self, by=None, diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c36dd9463c61d..d53fbe2e60e9a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -79,7 +79,6 @@ class BlockManager(PandasObject): copy(deep=True) get_dtype_counts - get_ftype_counts get_dtypes get_ftypes @@ -246,9 +245,6 @@ def _get_counts(self, f): def get_dtype_counts(self): return self._get_counts(lambda b: b.dtype.name) - def get_ftype_counts(self): - return self._get_counts(lambda b: b.ftype) - def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) return algos.take_1d(dtypes, self._blknos, allow_fill=False) @@ -1555,9 +1551,6 @@ def ftype(self): def get_dtype_counts(self): return {self.dtype.name: 1} - def get_ftype_counts(self): - return {self.ftype: 1} - def get_dtypes(self): return np.array([self._block.dtype]) diff --git a/pandas/core/series.py b/pandas/core/series.py index c10871d04ef3e..a950b4496baa7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4615,26 +4615,6 @@ def dropna(self, axis=0, inplace=False, how=None): else: return self.copy() - def valid(self, inplace=False, **kwargs): - """ - Return Series without null values. - - .. deprecated:: 0.23.0 - Use :meth:`Series.dropna` instead. - - Returns - ------- - Series - Series without null values. - """ - warnings.warn( - "Method .valid will be removed in a future version. 
" - "Use .dropna instead.", - FutureWarning, - stacklevel=2, - ) - return self.dropna(inplace=inplace, **kwargs) - # ---------------------------------------------------------------------- # Time series-oriented methods diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9cc9c5dc697b6..005ca8d95182e 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2279,14 +2279,6 @@ def test_clip(self, float_frame): median = float_frame.median().median() original = float_frame.copy() - with tm.assert_produces_warning(FutureWarning): - capped = float_frame.clip_upper(median) - assert not (capped.values > median).any() - - with tm.assert_produces_warning(FutureWarning): - floored = float_frame.clip_lower(median) - assert not (floored.values < median).any() - double = float_frame.clip(upper=median, lower=median) assert not (double.values != median).any() @@ -2298,16 +2290,6 @@ def test_inplace_clip(self, float_frame): median = float_frame.median().median() frame_copy = float_frame.copy() - with tm.assert_produces_warning(FutureWarning): - frame_copy.clip_upper(median, inplace=True) - assert not (frame_copy.values > median).any() - frame_copy = float_frame.copy() - - with tm.assert_produces_warning(FutureWarning): - frame_copy.clip_lower(median, inplace=True) - assert not (frame_copy.values < median).any() - frame_copy = float_frame.copy() - frame_copy.clip(upper=median, lower=median, inplace=True) assert not (frame_copy.values != median).any() @@ -2759,8 +2741,7 @@ def test_series_broadcasting(self): s_nan = Series([np.nan, np.nan, 1]) with tm.assert_produces_warning(None): - with tm.assert_produces_warning(FutureWarning): - df_nan.clip_lower(s, axis=0) + df_nan.clip(lower=s, axis=0) for op in ["lt", "le", "gt", "ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index ae452e6faef01..096a5aa99bd80 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -243,11 +243,6 @@ def test_to_xarray(self): assert isinstance(result, DataArray) tm.assert_series_equal(result.to_series(), s) - def test_valid_deprecated(self): - # GH18800 - with tm.assert_produces_warning(FutureWarning): - pd.Series([]).valid() - @pytest.mark.parametrize( "s", [ diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 274b72b0561a9..6e26d407ab0ec 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -393,14 +393,3 @@ def test_cache_updating(self): tm.assert_frame_equal(df, expected) expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) - - def test_deprecate_is_copy(self): - # GH18801 - df = DataFrame({"A": [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # getter - df.is_copy - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # setter - df.is_copy = "test deprecated is_copy" diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 79eaeaf051d2e..e25c4456147f7 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -655,11 +655,6 @@ def test_matmul(self): def test_clip(self, datetime_series): val = datetime_series.median() - with tm.assert_produces_warning(FutureWarning): - assert datetime_series.clip_lower(val).min() == val 
- with tm.assert_produces_warning(FutureWarning): - assert datetime_series.clip_upper(val).max() == val - assert datetime_series.clip(lower=val).min() == val assert datetime_series.clip(upper=val).max() == val @@ -678,10 +673,8 @@ def test_clip_types_and_nulls(self): for s in sers: thresh = s[2] - with tm.assert_produces_warning(FutureWarning): - lower = s.clip_lower(thresh) - with tm.assert_produces_warning(FutureWarning): - upper = s.clip_upper(thresh) + lower = s.clip(lower=thresh) + upper = s.clip(upper=thresh) assert lower[notna(lower)].min() == thresh assert upper[notna(upper)].max() == thresh assert list(isna(s)) == list(isna(lower)) @@ -703,12 +696,6 @@ def test_clip_against_series(self): # GH #6966 s = Series([1.0, 1.0, 4.0]) - threshold = Series([1.0, 2.0, 3.0]) - - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.clip_lower(threshold), Series([1.0, 2.0, 4.0])) - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.clip_upper(threshold), Series([1.0, 1.0, 3.0])) lower = Series([1.0, 2.0, 3.0]) upper = Series([1.5, 2.5, 3.5]) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index e1ace952f722d..ec0318b2af13a 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -56,11 +56,6 @@ def test_dtype(self, datetime_series): # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): assert datetime_series.ftypes == "float64:dense" - # GH18243 - Assert .get_ftype_counts is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal( - datetime_series.get_ftype_counts(), Series(1, ["float64:dense"]) - ) @pytest.mark.parametrize("value", [np.nan, np.inf]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) From c0050e071d0cc31501d4f730c27a81f3a126fc5c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Nov 2019 05:31:19 -0800 Subject: [PATCH 154/185] REF: make Fixed.version a property (#29765) --- pandas/io/pytables.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b2d742fe05141..9a1bfdd2be798 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -9,7 +9,7 @@ import os import re import time -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -2558,21 +2558,22 @@ def __init__(self, parent, group, encoding=None, errors="strict", **kwargs): self.group = group self.encoding = _ensure_encoding(encoding) self.errors = errors - self.set_version() @property def is_old_version(self) -> bool: return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 - def set_version(self): + @property + def version(self) -> Tuple[int, int, int]: """ compute and set our version """ version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) try: - self.version = tuple(int(x) for x in version.split(".")) - if len(self.version) == 2: - self.version = self.version + (0,) + version = tuple(int(x) for x in version.split(".")) + if len(version) == 2: + version = version + (0,) except AttributeError: - self.version = (0, 0, 0) + version = (0, 0, 0) + return version @property def pandas_type(self): @@ -2598,7 +2599,6 @@ def set_object_info(self): """ set my pandas type & version """ self.attrs.pandas_type = str(self.pandas_kind) self.attrs.pandas_version = 
str(_version) - self.set_version() def copy(self): new_self = copy.copy(self) From 2570c1d9c08e3e072c62f8ac548e1d88ffaee3ed Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 21 Nov 2019 13:39:54 +0000 Subject: [PATCH 155/185] PERF: speed-up when scalar not found in Categorical's categories (#29750) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/arrays/categorical.py | 4 ++-- pandas/tests/arrays/categorical/test_operators.py | 2 +- pandas/tests/indexes/interval/test_interval.py | 4 ++-- pandas/tests/indexes/test_base.py | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 70c30eb42169b..54640ff576338 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -357,6 +357,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) - Performance improvement in :meth:`DataFrame.select_dtypes` by using vectorization instead of iterating over a loop (:issue:`28317`) - Performance improvement in :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` (:issue:`28795`) +- Performance improvement when searching for a scalar in a :class:`Categorical` and the scalar is not found in the categories (:issue:`29750`) .. _whatsnew_1000.bug_fixes: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9a94345a769df..85688a394ebda 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -133,9 +133,9 @@ def func(self, other): return ret else: if opname == "__eq__": - return np.repeat(False, len(self)) + return np.zeros(len(self), dtype=bool) elif opname == "__ne__": - return np.repeat(True, len(self)) + return np.ones(len(self), dtype=bool) else: raise TypeError( f"Cannot compare a Categorical for op {opname} with a " diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 22c1d5373372a..d62c4f4cf936e 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -48,7 +48,7 @@ def test_comparisons(self): tm.assert_numpy_array_equal(result, expected) result = self.factor == "d" - expected = np.repeat(False, len(self.factor)) + expected = np.zeros(len(self.factor), dtype=bool) tm.assert_numpy_array_equal(result, expected) # comparisons with categoricals diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 73eacd8c4856e..f3c8c5cb6efa1 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -105,11 +105,11 @@ def test_with_nans(self, closed): assert index.hasnans is False result = index.isna() - expected = np.repeat(False, len(index)) + expected = np.zeros(len(index), dtype=bool) tm.assert_numpy_array_equal(result, expected) result = index.notna() - expected = np.repeat(True, len(index)) + expected = np.ones(len(index), dtype=bool) tm.assert_numpy_array_equal(result, expected) index = self.create_index_with_nan(closed=closed) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 5bfa13c0865f1..facc025409f08 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -730,7 +730,7 @@ def test_nanosecond_index_access(self): assert first_value == x[Timestamp(expected_ts)] def test_booleanindex(self, index): - bool_index =
np.repeat(True, len(index)).astype(bool) + bool_index = np.ones(len(index), dtype=bool) bool_index[5:30:2] = False sub_index = index[bool_index] From 555e6e1cbb852b2b0c2cd13f6f575656254160f6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 21 Nov 2019 16:27:39 +0000 Subject: [PATCH 156/185] TYP: disallow comment-based annotation syntax (#29741) --- ci/code_checks.sh | 4 +++ doc/source/development/contributing.rst | 10 +++--- pandas/_config/config.py | 8 ++--- pandas/_version.py | 2 +- pandas/compat/numpy/function.py | 24 ++++++------- pandas/core/accessor.py | 4 +-- pandas/core/algorithms.py | 2 +- pandas/core/apply.py | 3 +- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/integer.py | 4 +-- pandas/core/arrays/period.py | 6 ++-- pandas/core/arrays/timedeltas.py | 4 +-- pandas/core/base.py | 8 ++--- pandas/core/computation/expr.py | 2 +- pandas/core/dtypes/base.py | 2 +- pandas/core/dtypes/dtypes.py | 36 ++++++++++---------- pandas/core/frame.py | 6 ++-- pandas/core/generic.py | 20 +++++------ pandas/core/groupby/generic.py | 4 +-- pandas/core/groupby/groupby.py | 7 ++-- pandas/core/groupby/grouper.py | 11 +++--- pandas/core/groupby/ops.py | 8 ++--- pandas/core/indexes/base.py | 16 ++++----- pandas/core/indexes/datetimelike.py | 4 +-- pandas/core/indexes/range.py | 8 ++--- pandas/core/indexing.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/core/nanops.py | 2 +- pandas/core/ops/docstrings.py | 4 +-- pandas/core/resample.py | 2 +- pandas/core/reshape/merge.py | 3 +- pandas/core/reshape/pivot.py | 11 +++--- pandas/core/series.py | 9 ++--- pandas/core/strings.py | 4 +-- pandas/core/window/rolling.py | 8 ++--- pandas/io/common.py | 2 +- pandas/io/excel/_odfreader.py | 4 +-- pandas/io/excel/_openpyxl.py | 2 +- pandas/io/formats/format.py | 26 ++++++++------ pandas/io/formats/html.py | 9 +++-- pandas/io/formats/latex.py | 2 +- pandas/io/formats/printing.py | 2 +- pandas/io/json/_json.py | 4 ++- pandas/io/json/_normalize.py | 4 +-- pandas/io/parsers.py | 4 +-- pandas/io/pytables.py | 20 +++++------ pandas/plotting/_matplotlib/core.py | 2 +- pandas/tests/api/test_api.py | 10 +++--- pandas/tests/arrays/test_datetimelike.py | 2 +- pandas/tests/computation/test_eval.py | 4 +-- pandas/tests/dtypes/test_common.py | 6 ++-- pandas/tests/extension/base/ops.py | 8 ++--- pandas/tests/indexes/common.py | 2 +- pandas/tests/indexing/test_coercion.py | 2 +- pandas/tests/io/parser/conftest.py | 4 +-- pandas/tests/io/test_sql.py | 4 +-- pandas/tests/tseries/offsets/test_offsets.py | 12 +++---- pandas/tseries/frequencies.py | 2 +- pandas/tseries/holiday.py | 2 +- pandas/tseries/offsets.py | 4 +-- pandas/util/_decorators.py | 4 ++- 63 files changed, 209 insertions(+), 194 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index edd8fcd418c47..7c6c98d910492 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -194,6 +194,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.py" --include="*.pyx" -E 'class.*:\n\n( )+"""' . 
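To see what the new CI check flags, the same pattern can be exercised with Python's `re` module, whose lookahead syntax matches grep's `-P` (PCRE) mode used above. This snippet is only an illustration of the pattern's behavior, not part of the check itself:

```python
import re

# The pattern added to code_checks.sh: a comment-based annotation,
# with a negative lookahead so "# type: ignore" stays allowed.
pattern = re.compile(r"# type: (?!ignore)")

samples = [
    "primes = []  # type: List[int]",  # flagged: comment-based annotation
    "x = f()  # type: ignore",         # allowed: mypy suppression comment
    "primes: List[int] = []",          # allowed: PEP 526 syntax
]
for line in samples:
    print(bool(pattern.search(line)), line)
```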
RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of comment-based annotation syntax' ; echo $MSG + invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG set -o pipefail if [[ "$AZURE" == "true" ]]; then diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 33084d0d23771..042d6926d84f5 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -804,7 +804,7 @@ Types imports should follow the ``from typing import ...`` convention. So rather import typing - primes = [] # type: typing.List[int] + primes: typing.List[int] = [] You should write @@ -812,19 +812,19 @@ You should write from typing import List, Optional, Union - primes = [] # type: List[int] + primes: List[int] = [] ``Optional`` should be used where applicable, so instead of .. code-block:: python - maybe_primes = [] # type: List[Union[int, None]] + maybe_primes: List[Union[int, None]] = [] You should write .. code-block:: python - maybe_primes = [] # type: List[Optional[int]] + maybe_primes: List[Optional[int]] = [] In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like @@ -840,7 +840,7 @@ The appropriate way to annotate this would be as follows str_type = str class SomeClass2: - str = None # type: str_type + str: str_type = None In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. 
For example diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 890db5b41907e..814f855cceeac 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -58,16 +58,16 @@ RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") # holds deprecated option metadata -_deprecated_options = {} # type: Dict[str, DeprecatedOption] +_deprecated_options: Dict[str, DeprecatedOption] = {} # holds registered option metadata -_registered_options = {} # type: Dict[str, RegisteredOption] +_registered_options: Dict[str, RegisteredOption] = {} # holds the current values for registered options -_global_config = {} # type: Dict[str, str] +_global_config: Dict[str, str] = {} # keys which have a special meaning -_reserved_keys = ["all"] # type: List[str] +_reserved_keys: List[str] = ["all"] class OptionError(AttributeError, KeyError): diff --git a/pandas/_version.py b/pandas/_version.py index 0cdedf3da3ea7..dfed9574c7cb0 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -47,7 +47,7 @@ class NotThisMethod(Exception): pass -HANDLERS = {} # type: Dict[str, Dict[str, Callable]] +HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index ea5aaf6b6476d..fffe09a74571e 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -106,7 +106,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): return skipna -ARGSORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]] +ARGSORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None @@ -122,7 +122,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): # two different signatures of argsort, this second validation # for when the `kind` param is supported -ARGSORT_DEFAULTS_KIND = OrderedDict() # type: OrderedDict[str, Optional[int]] +ARGSORT_DEFAULTS_KIND: "OrderedDict[str, Optional[int]]" = OrderedDict() ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None validate_argsort_kind = CompatValidator( @@ -169,14 +169,14 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -COMPRESS_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] +COMPRESS_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() COMPRESS_DEFAULTS["axis"] = None COMPRESS_DEFAULTS["out"] = None validate_compress = CompatValidator( COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1 ) -CUM_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] +CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None validate_cum_func = CompatValidator( @@ -202,7 +202,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): return skipna -ALLANY_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] +ALLANY_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() ALLANY_DEFAULTS["dtype"] = None ALLANY_DEFAULTS["out"] = None ALLANY_DEFAULTS["keepdims"] = False @@ -224,28 +224,28 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 ) -RESHAPE_DEFAULTS = dict(order="C") # type: Dict[str, str] +RESHAPE_DEFAULTS: Dict[str, str] = dict(order="C") validate_reshape = CompatValidator( RESHAPE_DEFAULTS, fname="reshape", method="both",
max_fname_arg_count=1 ) -REPEAT_DEFAULTS = dict(axis=None) # type: Dict[str, Any] +REPEAT_DEFAULTS: Dict[str, Any] = dict(axis=None) validate_repeat = CompatValidator( REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 ) -ROUND_DEFAULTS = dict(out=None) # type: Dict[str, Any] +ROUND_DEFAULTS: Dict[str, Any] = dict(out=None) validate_round = CompatValidator( ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]] +SORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() SORT_DEFAULTS["axis"] = -1 SORT_DEFAULTS["kind"] = "quicksort" SORT_DEFAULTS["order"] = None validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") -STAT_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Any]] +STAT_FUNC_DEFAULTS: "OrderedDict[str, Optional[Any]]" = OrderedDict() STAT_FUNC_DEFAULTS["dtype"] = None STAT_FUNC_DEFAULTS["out"] = None @@ -273,13 +273,13 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 ) -STAT_DDOF_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] +STAT_DDOF_FUNC_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() STAT_DDOF_FUNC_DEFAULTS["dtype"] = None STAT_DDOF_FUNC_DEFAULTS["out"] = None STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") -TAKE_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[str]] +TAKE_DEFAULTS: "OrderedDict[str, Optional[str]]" = OrderedDict() TAKE_DEFAULTS["out"] = None TAKE_DEFAULTS["mode"] = "raise" validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index fc60c01d7b808..182b07d57ea49 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -11,8 +11,8 @@ class DirNamesMixin: - _accessors = set() # type: Set[str] - _deprecations = frozenset() # type: FrozenSet[str] + _accessors: Set[str] = set() + _deprecations: FrozenSet[str] = frozenset() def _dir_deletions(self): """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9c14102529b48..18adb12a9ad72 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -50,7 +50,7 @@ from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices -_shared_docs = {} # type: Dict[str, str] +_shared_docs: Dict[str, str] = {} # --------------- # diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 34a8ed1fa7a83..8c49b2b803241 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -34,8 +34,9 @@ def frame_apply( """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) + klass: Type[FrameApply] if axis == 0: - klass = FrameRowApply # type: Type[FrameApply] + klass = FrameRowApply elif axis == 1: klass = FrameColumnApply diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 82dabe735581b..fa0e025c22c88 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -29,7 +29,7 @@ _not_implemented_message = "{} does not implement {}." 
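Several of the hunks above, such as the one for pandas/core/apply.py, rely on a detail of PEP 526: a bare annotation with no assignment declares the variable's type once, so each branch can then assign without repeating a `# type:` comment. Here is a minimal sketch of the pattern under that assumption, with hypothetical class names standing in for FrameRowApply/FrameColumnApply:

```python
from typing import Type


class FrameApplySketch:
    """Hypothetical stand-in for pandas' FrameApply hierarchy."""


class RowApply(FrameApplySketch):
    pass


class ColumnApply(FrameApplySketch):
    pass


def pick_klass(axis: int) -> FrameApplySketch:
    # Bare declaration: the annotation lives here, the assignments below.
    klass: Type[FrameApplySketch]
    if axis == 0:
        klass = RowApply
    else:
        klass = ColumnApply
    return klass()
```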
-_extension_array_shared_docs = dict() # type: Dict[str, str] +_extension_array_shared_docs: Dict[str, str] = dict() def try_cast_to_ea(cls_or_instance, obj, dtype=None): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8e66db4c61032..dc3c49b7e06a9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -51,7 +51,7 @@ class AttributesMixin: - _data = None # type: np.ndarray + _data: np.ndarray @classmethod def _simple_new(cls, values, **kwargs): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8e3c727a14c99..71420e6e58090 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -320,7 +320,7 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps # ----------------------------------------------------------------- # Constructors - _dtype = None # type: Union[np.dtype, DatetimeTZDtype] + _dtype: Union[np.dtype, DatetimeTZDtype] _freq = None def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 63296b4a26354..12b76df9a5983 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -40,9 +40,9 @@ class _IntegerDtype(ExtensionDtype): The attributes name & type are set when these subclasses are created. """ - name = None # type: str + name: str base = None - type = None # type: Type + type: Type na_value = np.nan def __repr__(self) -> str: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index f3d51b28ad399..41a8c48452647 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -161,7 +161,7 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): _scalar_type = Period # Names others delegate to us - _other_ops = [] # type: List[str] + _other_ops: List[str] = [] _bool_ops = ["is_leap_year"] _object_ops = ["start_time", "end_time", "freq"] _field_ops = [ @@ -894,9 +894,9 @@ def period_array( data = np.asarray(data) + dtype: Optional[PeriodDtype] if freq: - # typed Optional here because the else block below assigns None - dtype = PeriodDtype(freq) # type: Optional[PeriodDtype] + dtype = PeriodDtype(freq) else: dtype = None diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 816beb758dd33..bacd0b9699e93 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -161,8 +161,8 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _scalar_type = Timedelta __array_priority__ = 1000 # define my properties & methods for delegation - _other_ops = [] # type: List[str] - _bool_ops = [] # type: List[str] + _other_ops: List[str] = [] + _bool_ops: List[str] = [] _object_ops = ["freq"] _field_ops = ["days", "seconds", "microseconds", "nanoseconds"] _datetimelike_ops = _field_ops + _object_ops + _bool_ops diff --git a/pandas/core/base.py b/pandas/core/base.py index c9855701eeb03..176a92132e20a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -36,7 +36,7 @@ from pandas.core.arrays import ExtensionArray import pandas.core.nanops as nanops -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() _indexops_doc_kwargs = dict( klass="IndexOpsMixin", inplace="", @@ -603,7 +603,7 @@ def _is_builtin_func(self, arg): class ShallowMixin: - _attributes = [] # type: List[str] + _attributes: List[str] = [] def _shallow_copy(self, obj=None, **kwargs): """ @@ -627,7 
+627,7 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 - _deprecations = frozenset( + _deprecations: FrozenSet[str] = frozenset( [ "tolist", # tolist is not deprecated, just suppressed in the __dir__ "base", @@ -637,7 +637,7 @@ class IndexOpsMixin: "flags", "strides", ] - ) # type: FrozenSet[str] + ) def transpose(self, *args, **kwargs): """ diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4d1fc42070ea8..253d64d50d0cd 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -378,7 +378,7 @@ class BaseExprVisitor(ast.NodeVisitor): preparser : callable """ - const_type = Constant # type: Type[Term] + const_type: Type[Term] = Constant term_type = Term binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index c90f1cdeaabfd..8acdf32c8768e 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -81,7 +81,7 @@ def __from_arrow__( provided for registering virtual subclasses. """ - _metadata = () # type: Tuple[str, ...] + _metadata: Tuple[str, ...] = () def __str__(self) -> str: return self.name diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 3d1388db371ca..523c8e8bd02d0 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -20,7 +20,7 @@ # GH26403: sentinel value used for the default value of ordered in the # CategoricalDtype constructor to detect when ordered=None is explicitly passed -ordered_sentinel = object() # type: object +ordered_sentinel: object = object() def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: @@ -66,7 +66,7 @@ class Registry: """ def __init__(self): - self.dtypes = [] # type: List[Type[ExtensionDtype]] + self.dtypes: List[Type[ExtensionDtype]] = [] def register(self, dtype: Type[ExtensionDtype]) -> None: """ @@ -119,21 +119,21 @@ class PandasExtensionDtype(ExtensionDtype): THIS IS NOT A REAL NUMPY DTYPE """ - type = None # type: Any - kind = None # type: Any + type: Any + kind: Any # The Any type annotations above are here only because mypy seems to have a # problem dealing with with multiple inheritance from PandasExtensionDtype # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None - str = None # type: Optional[str_type] + str: Optional[str_type] = None num = 100 - shape = tuple() # type: Tuple[int, ...] + shape: Tuple[int, ...] = tuple() itemsize = 8 base = None isbuiltin = 0 isnative = 0 - _cache = {} # type: Dict[str_type, 'PandasExtensionDtype'] + _cache: Dict[str_type, "PandasExtensionDtype"] = {} def __str__(self) -> str_type: """ @@ -214,12 +214,12 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): # TODO: Document public vs. 
private API name = "category" - type = CategoricalDtypeType # type: Type[CategoricalDtypeType] - kind = "O" # type: str_type + type: Type[CategoricalDtypeType] = CategoricalDtypeType + kind: str_type = "O" str = "|O08" base = np.dtype("O") _metadata = ("categories", "ordered", "_ordered_from_sentinel") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __init__( self, categories=None, ordered: Union[Ordered, object] = ordered_sentinel @@ -650,15 +650,15 @@ class DatetimeTZDtype(PandasExtensionDtype): datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')] """ - type = Timestamp # type: Type[Timestamp] - kind = "M" # type: str_type + type: Type[Timestamp] = Timestamp + kind: str_type = "M" str = "|M8[ns]" num = 101 base = np.dtype("M8[ns]") na_value = NaT _metadata = ("unit", "tz") _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __init__(self, unit="ns", tz=None): if isinstance(unit, DatetimeTZDtype): @@ -812,14 +812,14 @@ class PeriodDtype(PandasExtensionDtype): period[M] """ - type = Period # type: Type[Period] - kind = "O" # type: str_type + type: Type[Period] = Period + kind: str_type = "O" str = "|O08" base = np.dtype("O") num = 102 _metadata = ("freq",) _match = re.compile(r"(P|p)eriod\[(?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, freq=None): """ @@ -972,13 +972,13 @@ class IntervalDtype(PandasExtensionDtype): """ name = "interval" - kind = None # type: Optional[str_type] + kind: Optional[str_type] = None str = "|O08" base = np.dtype("O") num = 103 _metadata = ("subtype",) _match = re.compile(r"(I|i)nterval\[(?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, subtype=None): from pandas.core.dtypes.common import ( diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8b31b6d503eda..46b213b25df49 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -381,9 +381,9 @@ class DataFrame(NDFrame): def _constructor(self) -> Type["DataFrame"]: return DataFrame - _constructor_sliced = Series # type: Type[Series] - _deprecations = NDFrame._deprecations | frozenset([]) # type: FrozenSet[str] - _accessors = set() # type: Set[str] + _constructor_sliced: Type[Series] = Series + _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) + _accessors: Set[str] = set() @property def _constructor_expanddim(self): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7f83bb9e69f7a..b16a72f01c739 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -92,7 +92,7 @@ # goal is to be able to define the docs close to function, while still being # able to share -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() _shared_doc_kwargs = dict( axes="keywords for axes", klass="Series/DataFrame", @@ -154,7 +154,7 @@ class NDFrame(PandasObject, SelectionMixin): copy : bool, default False """ - _internal_names = [ + _internal_names: List[str] = [ "_data", "_cacher", "_item_cache", @@ -168,15 +168,15 @@ class NDFrame(PandasObject, SelectionMixin): "_metadata", "__array_struct__", "__array_interface__", - ] # type: List[str] - _internal_names_set = set(_internal_names) # type: Set[str] - _accessors = set() # type: Set[str] - _deprecations = frozenset( + ] + 
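The class-attribute conversions, such as those in pandas/core/generic.py below, apply the same rule at class scope: the default value stays, only the annotation moves out of the comment. A hedged before/after sketch; the `ClassVar` wrapper is our own addition for illustration and is not used in these hunks:

```python
from typing import ClassVar, FrozenSet, List


class Old:
    _metadata = []  # type: List[str]   # comment syntax, now rejected by CI


class New:
    _metadata: List[str] = []           # PEP 526 variable annotation
    # ClassVar (an assumption, not in the patch) additionally tells type
    # checkers the attribute belongs to the class, not to each instance.
    _deprecations: ClassVar[FrozenSet[str]] = frozenset(["ix"])


print(New.__annotations__)
```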
_internal_names_set: Set[str] = set(_internal_names) + _accessors: Set[str] = set() + _deprecations: FrozenSet[str] = frozenset( ["get_dtype_counts", "get_values", "ftypes", "ix"] - ) # type: FrozenSet[str] - _metadata = [] # type: List[str] + ) + _metadata: List[str] = [] _is_copy = None - _data = None # type: BlockManager + _data: BlockManager _attrs: Dict[Optional[Hashable], Any] # ---------------------------------------------------------------------- @@ -3599,7 +3599,7 @@ class animal locomotion result._set_is_copy(self, copy=not result._is_view) return result - _xs = xs # type: Callable + _xs: Callable = xs def __getitem__(self, item): raise AbstractMethodError(self) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 900e11dedb8b1..99ef281e842b1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1105,7 +1105,7 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: axis = self.axis obj = self._obj_with_exclusions - result = OrderedDict() # type: OrderedDict + result: OrderedDict = OrderedDict() if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) @@ -1122,7 +1122,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: # only for axis==0 obj = self._obj_with_exclusions - result = OrderedDict() # type: dict + result: OrderedDict = OrderedDict() cannot_agg = [] errors = None for item in obj: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 21c085c775399..9e12ac82fb3ae 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -345,7 +345,7 @@ def _group_selection_context(groupby): class _GroupBy(PandasObject, SelectionMixin): _group_selection = None - _apply_whitelist = frozenset() # type: FrozenSet[str] + _apply_whitelist: FrozenSet[str] = frozenset() def __init__( self, @@ -2518,12 +2518,11 @@ def get_groupby( mutated: bool = False, ): + klass: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy - klass = ( - SeriesGroupBy - ) # type: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] + klass = SeriesGroupBy elif isinstance(obj, DataFrame): from pandas.core.groupby.generic import DataFrameGroupBy diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2b946d1ff0a7a..308d4d1864bdd 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -93,7 +93,7 @@ class Grouper: >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ - _attributes = ("key", "level", "freq", "axis", "sort") # type: Tuple[str, ...] + _attributes: Tuple[str, ...] 
= ("key", "level", "freq", "axis", "sort") def __new__(cls, *args, **kwargs): if kwargs.get("freq") is not None: @@ -373,8 +373,8 @@ def __repr__(self) -> str: def __iter__(self): return iter(self.indices) - _codes = None # type: np.ndarray - _group_index = None # type: Index + _codes: Optional[np.ndarray] = None + _group_index: Optional[Index] = None @property def ngroups(self) -> int: @@ -405,6 +405,7 @@ def result_index(self) -> Index: def group_index(self) -> Index: if self._group_index is None: self._make_codes() + assert self._group_index is not None return self._group_index def _make_codes(self) -> None: @@ -576,8 +577,8 @@ def get_grouper( else: levels = [level] * len(keys) - groupings = [] # type: List[Grouping] - exclusions = [] # type: List[Hashable] + groupings: List[Grouping] = [] + exclusions: List[Hashable] = [] # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a7e0a901a5394..4780254e060e6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -90,7 +90,7 @@ def __init__( self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis - self._groupings = list(groupings) # type: List[grouper.Grouping] + self._groupings: List[grouper.Grouping] = list(groupings) self.sort = sort self.group_keys = group_keys self.mutated = mutated @@ -153,7 +153,7 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0): group_keys = self._get_group_keys() result_values = None - sdata = splitter._get_sorted_data() # type: FrameOrSeries + sdata: FrameOrSeries = splitter._get_sorted_data() if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 # if we pass EA instead of ndarray @@ -551,7 +551,7 @@ def _cython_operation( if vdim == 1 and arity == 1: result = result[:, 0] - names = self._name_functions.get(how, None) # type: Optional[List[str]] + names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) @@ -923,7 +923,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: def get_splitter(data: FrameOrSeries, *args, **kwargs) -> DataSplitter: if isinstance(data, Series): - klass = SeriesSplitter # type: Type[DataSplitter] + klass: Type[DataSplitter] = SeriesSplitter else: # i.e. DataFrame klass = FrameSplitter diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8978a09825ee9..10c0f465f69da 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -205,11 +205,11 @@ class Index(IndexOpsMixin, PandasObject): """ # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = ( + _deprecations: FrozenSet[str] = ( PandasObject._deprecations | IndexOpsMixin._deprecations | frozenset(["asobject", "contains", "dtype_str", "get_values", "set_value"]) - ) # type: FrozenSet[str] + ) # To hand over control to subclasses _join_precedence = 1 @@ -321,10 +321,9 @@ def __new__( # the DatetimeIndex construction. 
# Note we can pass copy=False because the .astype below # will always make a copy - result = DatetimeIndex( - data, copy=False, name=name, **kwargs - ) # type: "Index" - return result.astype(object) + return DatetimeIndex(data, copy=False, name=name, **kwargs).astype( + object + ) else: return DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) @@ -332,8 +331,9 @@ def __new__( if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy - result = TimedeltaIndex(data, copy=False, name=name, **kwargs) - return result.astype(object) + return TimedeltaIndex(data, copy=False, name=name, **kwargs).astype( + object + ) else: return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index df3420ea14e24..e420cf0cb0d78 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -826,9 +826,9 @@ class DatetimelikeDelegateMixin(PandasDelegate): """ # raw_methods : dispatch methods that shouldn't be boxed in an Index - _raw_methods = set() # type: Set[str] + _raw_methods: Set[str] = set() # raw_properties : dispatch properties that shouldn't be boxed in an Index - _raw_properties = set() # type: Set[str] + _raw_properties: Set[str] = set() name = None _data: ExtensionArray diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 6f677848b1c79..e68b340130b9b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Union +from typing import Optional, Union import warnings import numpy as np @@ -73,10 +73,10 @@ class RangeIndex(Int64Index): _typ = "rangeindex" _engine_type = libindex.Int64Engine - _range = None # type: range + _range: range # check whether self._data has been called - _cached_data = None # type: np.ndarray + _cached_data: Optional[np.ndarray] = None # -------------------------------------------------------------------- # Constructors @@ -654,7 +654,7 @@ def _concat_same_dtype(self, indexes, name): non_empty_indexes = [obj for obj in indexes if len(obj)] for obj in non_empty_indexes: - rng = obj._range # type: range + rng: range = obj._range if start is None: # This is set by the first non-empty index diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 673764ef6a124..b52015b738c6e 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -100,7 +100,7 @@ class IndexingError(Exception): class _NDFrameIndexer(_NDFrameIndexerBase): - _valid_types = None # type: str + _valid_types: str axis = None def __call__(self, axis=None): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d53fbe2e60e9a..5e60440f1577e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -126,7 +126,7 @@ def __init__( do_integrity_check: bool = True, ): self.axes = [ensure_index(ax) for ax in axes] - self.blocks = tuple(blocks) # type: Tuple[Block, ...] + self.blocks: Tuple[Block, ...] 
= tuple(blocks) for block in blocks: if self.ndim != block.ndim: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 7e50348962fc5..a2a40bbf93604 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -660,7 +660,7 @@ def _get_counts_nanvar( count = np.nan d = np.nan else: - mask2 = count <= ddof # type: np.ndarray + mask2: np.ndarray = count <= ddof if mask2.any(): np.putmask(d, mask2, np.nan) np.putmask(count, mask2, np.nan) diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 5d3f9cd92aa1a..e3db65f11a332 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -233,7 +233,7 @@ def _make_flex_doc(op_name, typ): dtype: float64 """ -_op_descriptions = { +_op_descriptions: Dict[str, Dict[str, Optional[str]]] = { # Arithmetic Operators "add": { "op": "+", @@ -310,7 +310,7 @@ def _make_flex_doc(op_name, typ): "reverse": None, "series_examples": None, }, -} # type: Dict[str, Dict[str, Optional[str]]] +} _op_names = list(_op_descriptions.keys()) for key in _op_names: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 81ec4f45ec8e1..25731c4e1c54c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -31,7 +31,7 @@ from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset, Day, Nano, Tick -_shared_docs_kwargs = dict() # type: Dict[str, str] +_shared_docs_kwargs: Dict[str, str] = dict() class Resampler(_GroupBy, ShallowMixin): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4d838db6c95f6..fdd31b3b7c022 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -583,8 +583,9 @@ def __init__( self.indicator = indicator + self.indicator_name: Optional[str] if isinstance(self.indicator, str): - self.indicator_name = self.indicator # type: Optional[str] + self.indicator_name = self.indicator elif isinstance(self.indicator, bool): self.indicator_name = "_merge" if self.indicator else None else: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b126b6e221ccc..c7d3adece521e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -211,10 +211,9 @@ def _add_margins( if margins_name in table.columns.get_level_values(level): raise ValueError(msg) + key: Union[str, Tuple[str, ...]] if len(rows) > 1: - key = (margins_name,) + ("",) * ( - len(rows) - 1 - ) # type: Union[str, Tuple[str, ...]] + key = (margins_name,) + ("",) * (len(rows) - 1) else: key = margins_name @@ -564,7 +563,7 @@ def crosstab( if pass_objs: common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) - data = {} # type: dict + data: Dict = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) @@ -615,11 +614,11 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): if margins is False: # Actual Normalizations - normalizers = { + normalizers: Dict[Union[bool, str], Callable] = { "all": lambda x: x / x.sum(axis=1).sum(axis=0), "columns": lambda x: x / x.sum(), "index": lambda x: x.div(x.sum(axis=1), axis=0), - } # type: Dict[Union[bool, str], Callable] + } normalizers[True] = normalizers["all"] diff --git a/pandas/core/series.py b/pandas/core/series.py index a950b4496baa7..6045d6a654508 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -170,7 +170,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Copy input data. 
""" - _metadata = [] # type: List[str] + _metadata: List[str] = [] _accessors = {"dt", "cat", "str", "sparse"} _deprecations = ( base.IndexOpsMixin._deprecations @@ -184,7 +184,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): hasnans = property( base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ ) - _data = None # type: SingleBlockManager + _data: SingleBlockManager # ---------------------------------------------------------------------- # Constructors @@ -781,9 +781,10 @@ def __array_ufunc__( inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) + + name: Optional[Hashable] if len(set(names)) == 1: - # we require names to be hashable, right? - name = names[0] # type: Any + name = names[0] else: name = None diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 413e7e85eb6fe..137c37f938dfa 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -52,7 +52,7 @@ ) _cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() def cat_core(list_of_columns: List, sep: str): @@ -3284,7 +3284,7 @@ def rindex(self, sub, start=0, end=None): """ # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args = {} # type: Dict[str, Dict[str, str]] + _doc_args: Dict[str, Dict[str, str]] = {} _doc_args["lower"] = dict(type="lowercase", method="lower", version="") _doc_args["upper"] = dict(type="uppercase", method="upper", version="") _doc_args["title"] = dict(type="titlecase", method="title", version="") diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index fd2e8aa2ad02f..6a35664ece765 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -53,7 +53,7 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): - _attributes = [ + _attributes: List[str] = [ "window", "min_periods", "center", @@ -61,8 +61,8 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): "axis", "on", "closed", - ] # type: List[str] - exclusions = set() # type: Set[str] + ] + exclusions: Set[str] = set() def __init__( self, @@ -449,7 +449,7 @@ def _apply( window_indexer = self._get_window_indexer() results = [] - exclude = [] # type: List[Scalar] + exclude: List[Scalar] = [] for i, b in enumerate(blocks): try: values = self._prep_values(b.values) diff --git a/pandas/io/common.py b/pandas/io/common.py index bd3808cf37b6b..c0eddb679c6f8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -418,7 +418,7 @@ def _get_handle( except ImportError: need_text_wrapping = BufferedIOBase # type: ignore - handles = list() # type: List[IO] + handles: List[IO] = list() f = path_or_buf # Convert pathlib.Path/py.path.local or string diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 97556f9685001..78054936f50f2 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -76,12 +76,12 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: empty_rows = 0 max_row_len = 0 - table = [] # type: List[List[Scalar]] + table: List[List[Scalar]] = [] for i, sheet_row in enumerate(sheet_rows): sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 - table_row = [] # type: List[Scalar] + table_row: List[Scalar] = [] for j, sheet_cell in enumerate(sheet_cells): if sheet_cell.qname == table_cell_name: diff --git a/pandas/io/excel/_openpyxl.py 
b/pandas/io/excel/_openpyxl.py index d0d6096a4425e..d278c6b3bbef2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -531,7 +531,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: - data = [] # type: List[List[Scalar]] + data: List[List[Scalar]] = [] for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 41bddc7683764..b18f0db622b3e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -262,6 +262,8 @@ def __init__( def _chk_truncate(self) -> None: from pandas.core.reshape.concat import concat + self.tr_row_num: Optional[int] + min_rows = self.min_rows max_rows = self.max_rows # truncation determined by max_rows, actual truncated number of rows @@ -280,7 +282,7 @@ def _chk_truncate(self) -> None: else: row_num = max_rows // 2 series = concat((series.iloc[:row_num], series.iloc[-row_num:])) - self.tr_row_num = row_num # type: Optional[int] + self.tr_row_num = row_num else: self.tr_row_num = None self.tr_series = series @@ -448,13 +450,13 @@ def _get_adjustment() -> TextAdjustment: class TableFormatter: - show_dimensions = None # type: bool - is_truncated = None # type: bool - formatters = None # type: formatters_type - columns = None # type: Index + show_dimensions: bool + is_truncated: bool + formatters: formatters_type + columns: Index @property - def should_show_dimensions(self) -> Optional[bool]: + def should_show_dimensions(self) -> bool: return self.show_dimensions is True or ( self.show_dimensions == "truncate" and self.is_truncated ) @@ -616,6 +618,8 @@ def _chk_truncate(self) -> None: # Cut the data to the information actually printed max_cols = self.max_cols max_rows = self.max_rows + self.max_rows_adj: Optional[int] + max_rows_adj: Optional[int] if max_cols == 0 or max_rows == 0: # assume we are in the terminal (w, h) = get_terminal_size() @@ -631,7 +635,7 @@ def _chk_truncate(self) -> None: self.header = cast(bool, self.header) n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row # rows available to fill with actual data - max_rows_adj = self.h - n_add_rows # type: Optional[int] + max_rows_adj = self.h - n_add_rows self.max_rows_adj = max_rows_adj # Format only rows and columns that could potentially fit the @@ -1073,7 +1077,7 @@ def _get_formatted_index(self, frame: "DataFrame") -> List[str]: return adjoined def _get_column_name_list(self) -> List[str]: - names = [] # type: List[str] + names: List[str] = [] columns = self.frame.columns if isinstance(columns, ABCMultiIndex): names.extend("" if name is None else name for name in columns.names) @@ -1124,8 +1128,9 @@ def format_array( List[str] """ + fmt_klass: Type[GenericArrayFormatter] if is_datetime64_dtype(values.dtype): - fmt_klass = Datetime64Formatter # type: Type[GenericArrayFormatter] + fmt_klass = Datetime64Formatter elif is_datetime64tz_dtype(values): fmt_klass = Datetime64TZFormatter elif is_timedelta64_dtype(values.dtype): @@ -1375,11 +1380,12 @@ def format_values_with(float_format): # There is a special default string when we are fixed-width # The default is otherwise to use str instead of a formatting string + float_format: Optional[float_format_type] if self.float_format is None: if self.fixed_width: float_format = partial( "{value: .{digits:d}f}".format, digits=self.digits - ) # type: Optional[float_format_type] + ) else: 
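# One behavioral subtlety in the class-level hunks above (for example the
# TableFormatter one): "show_dimensions = None  # type: bool" bound a real
# class attribute to None, while the bare annotation "show_dimensions: bool"
# records the type and binds nothing. A small hypothetical sketch, with
# illustrative class names:
class OldStyle:
    flag = None  # the comment-annotated form still created an attribute

class NewStyle:
    flag: bool  # annotation only; nothing is bound until assignment

assert hasattr(OldStyle, "flag")  # exists, and reads as None
assert not hasattr(NewStyle, "flag")  # reading it raises AttributeError
assert NewStyle.__annotations__ == {"flag": bool}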
float_format = self.float_format else: diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 38f2e332017f0..0c6b0c1a5810b 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -45,7 +45,7 @@ def __init__( self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns - self.elements = [] # type: List[str] + self.elements: List[str] = [] self.bold_rows = self.fmt.bold_rows self.escape = self.fmt.escape self.show_dimensions = self.fmt.show_dimensions @@ -138,11 +138,10 @@ def _write_cell( else: start_tag = "<{kind}>".format(kind=kind) + esc: Union[OrderedDict[str, str], Dict] if self.escape: # escape & first to prevent double escaping of & - esc = OrderedDict( - [("&", r"&"), ("<", r"<"), (">", r">")] - ) # type: Union[OrderedDict[str, str], Dict] + esc = OrderedDict([("&", r"&"), ("<", r"<"), (">", r">")]) else: esc = {} @@ -408,7 +407,7 @@ def _write_regular_rows( else: index_values = self.fmt.tr_frame.index.format() - row = [] # type: List[str] + row: List[str] = [] for i in range(nrows): if truncate_v and i == (self.fmt.tr_row_num): diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 6f903e770c86c..008a99427f3c7 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -133,7 +133,7 @@ def pad_empties(x): if self.fmt.has_index_names and self.fmt.show_index_names: nlevels += 1 strrows = list(zip(*strcols)) - self.clinebuf = [] # type: List[List[int]] + self.clinebuf: List[List[int]] = [] for i, row in enumerate(strrows): if i == nlevels and self.fmt.header: diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 061103820ca83..a4f1488fb6b69 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -513,7 +513,7 @@ def format_object_attrs( list of 2-tuple """ - attrs = [] # type: List[Tuple[str, Union[str, int]]] + attrs: List[Tuple[str, Union[str, int]]] = [] if hasattr(obj, "dtype") and include_dtype: # error: "Sequence[Any]" has no attribute "dtype" attrs.append(("dtype", "'{}'".format(obj.dtype))) # type: ignore diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 26a3248262f9a..89d5b52ffbf1e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -62,8 +62,10 @@ def to_json( if orient == "table" and isinstance(obj, Series): obj = obj.to_frame(name=obj.name or "values") + + writer: Type["Writer"] if orient == "table" and isinstance(obj, DataFrame): - writer = JSONTableWriter # type: Type["Writer"] + writer = JSONTableWriter elif isinstance(obj, Series): writer = SeriesWriter elif isinstance(obj, DataFrame): diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 702241bde2b34..df513d4d37d71 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -267,10 +267,10 @@ def _pull_field(js, spec): meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now - records = [] # type: List + records: List = [] lengths = [] - meta_vals = defaultdict(list) # type: DefaultDict + meta_vals: DefaultDict = defaultdict(list) meta_keys = [sep.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ff3583b79d79c..cf1511c1221b3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -522,8 +522,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} -_deprecated_defaults = {} # 
type: Dict[str, Any] -_deprecated_args = set() # type: Set[str] +_deprecated_defaults: Dict[str, Any] = {} +_deprecated_args: Set[str] = set() def _make_parser_function(name, default_sep=","): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9a1bfdd2be798..ba53d8cfd0de5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1343,7 +1343,7 @@ def copy( data = self.select(k) if s.is_table: - index = False # type: Union[bool, list] + index: Union[bool, list] = False if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( @@ -2548,9 +2548,9 @@ class Fixed: group : the group node where the table resides """ - pandas_kind = None # type: str - obj_type = None # type: Type[Union[DataFrame, Series]] - ndim = None # type: int + pandas_kind: str + obj_type: Type[Union[DataFrame, Series]] + ndim: int is_table = False def __init__(self, parent, group, encoding=None, errors="strict", **kwargs): @@ -2708,7 +2708,7 @@ class GenericFixed(Fixed): _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} _reverse_index_map = {v: k for k, v in _index_type_map.items()} - attributes = [] # type: List[str] + attributes: List[str] = [] # indexer helpders def _class_to_alias(self, cls) -> str: @@ -3254,7 +3254,7 @@ class Table(Fixed): """ pandas_kind = "wide_table" - table_type = None # type: str + table_type: str levels = 1 is_table = True is_shape_reversed = False @@ -4147,11 +4147,11 @@ class LegacyTable(Table): """ - _indexables = [ + _indexables: Optional[List[IndexCol]] = [ IndexCol(name="index", axis=1, pos=0), IndexCol(name="column", axis=2, pos=1, index_kind="columns_kind"), DataCol(name="fields", cname="values", kind_attr="fields", pos=2), - ] # type: Optional[List[IndexCol]] + ] table_type = "legacy" ndim = 3 @@ -4424,7 +4424,7 @@ class AppendableFrameTable(AppendableTable): pandas_kind = "frame_table" table_type = "appendable_frame" ndim = 2 - obj_type = DataFrame # type: Type[Union[DataFrame, Series]] + obj_type: Type[Union[DataFrame, Series]] = DataFrame @property def is_transposed(self) -> bool: @@ -4650,7 +4650,7 @@ def _reindex_axis(obj, axis: int, labels: Index, other=None): if other is not None: labels = ensure_index(other.unique()).intersection(labels, sort=False) if not labels.equals(ax): - slicer = [slice(None, None)] * obj.ndim # type: List[Union[slice, Index]] + slicer: List[Union[slice, Index]] = [slice(None, None)] * obj.ndim slicer[axis] = labels obj = obj.loc[tuple(slicer)] return obj diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 5341dc3a6338a..0c5375ccc5d5c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -57,7 +57,7 @@ def _kind(self): _layout_type = "vertical" _default_rot = 0 - orientation = None # type: Optional[str] + orientation: Optional[str] = None _pop_attributes = [ "label", "style", diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 601fde80e9a94..5d11e160bbd71 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -43,7 +43,7 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_modules = [] # type: List[str] + deprecated_modules: List[str] = [] # misc misc = ["IndexSlice", "NaT"] @@ -94,10 +94,10 @@ class TestPDApi(Base): classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) # these are already deprecated; awaiting removal - deprecated_classes = [] # type: List[str] + deprecated_classes: List[str] = [] # these should be 
deprecated in the future - deprecated_classes_in_future = [] # type: List[str] + deprecated_classes_in_future: List[str] = [] # external modules exposed in pandas namespace modules = ["np", "datetime"] @@ -173,10 +173,10 @@ class TestPDApi(Base): funcs_to = ["to_datetime", "to_msgpack", "to_numeric", "to_pickle", "to_timedelta"] # top-level to deprecate in the future - deprecated_funcs_in_future = [] # type: List[str] + deprecated_funcs_in_future: List[str] = [] # these are already deprecated; awaiting removal - deprecated_funcs = [] # type: List[str] + deprecated_funcs: List[str] = [] # private modules in pandas namespace private_modules = [ diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 3bacd560e75cf..5cab0c1fe6d59 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -57,7 +57,7 @@ def timedelta_index(request): class SharedTests: - index_cls = None # type: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] + index_cls: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] def test_compare_len1_raises(self): # make sure we raise when comparing with different lengths, specific diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index a075521b67561..c6ce08080314a 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1889,11 +1889,11 @@ def test_invalid_parser(): pd.eval("x + y", local_dict={"x": 1, "y": 2}, parser="asdf") -_parsers = { +_parsers: Dict[str, Type[BaseExprVisitor]] = { "python": PythonExprVisitor, "pytables": pytables.ExprVisitor, "pandas": PandasExprVisitor, -} # type: Dict[str, Type[BaseExprVisitor]] +} @pytest.mark.parametrize("engine", _engines) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ae625ed8e389f..6d91d13027f69 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -290,7 +290,7 @@ def test_is_datetime_arraylike(): assert com.is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) -integer_dtypes = [] # type: List +integer_dtypes: List = [] @pytest.mark.parametrize( @@ -322,7 +322,7 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) -signed_integer_dtypes = [] # type: List +signed_integer_dtypes: List = [] @pytest.mark.parametrize( @@ -358,7 +358,7 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes = [] # type: List +unsigned_integer_dtypes: List = [] @pytest.mark.parametrize( diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index e968962caf0b7..5e4fb6d69e52c 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -62,10 +62,10 @@ class BaseArithmeticOpsTests(BaseOpsUtil): * divmod_exc = TypeError """ - series_scalar_exc = TypeError # type: Optional[Type[TypeError]] - frame_scalar_exc = TypeError # type: Optional[Type[TypeError]] - series_array_exc = TypeError # type: Optional[Type[TypeError]] - divmod_exc = TypeError # type: Optional[Type[TypeError]] + series_scalar_exc: Optional[Type[TypeError]] = TypeError + frame_scalar_exc: Optional[Type[TypeError]] = TypeError + series_array_exc: Optional[Type[TypeError]] = TypeError + divmod_exc: Optional[Type[TypeError]] = TypeError def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar diff --git a/pandas/tests/indexes/common.py 
b/pandas/tests/indexes/common.py index 1ac6370860ba6..c35c4c3568f74 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -31,7 +31,7 @@ class Base: """ base class for index sub-class tests """ - _holder = None # type: Optional[Type[Index]] + _holder: Optional[Type[Index]] = None _compat_props = ["shape", "ndim", "size", "nbytes"] def test_pickle_compat_construction(self): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 469c011001467..8b29cf3813d13 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -927,7 +927,7 @@ class TestReplaceSeriesCoercion(CoercionBase): klasses = ["series"] method = "replace" - rep = {} # type: Dict[str, List] + rep: Dict[str, List] = {} rep["object"] = ["a", "b"] rep["int64"] = [4, 5] rep["float64"] = [1.1, 2.2] diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 183ad500b15f3..a87e1e796c194 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -7,9 +7,9 @@ class BaseParser: - engine = None # type: Optional[str] + engine: Optional[str] = None low_memory = True - float_precision_choices = [] # type: List[Optional[str]] + float_precision_choices: List[Optional[str]] = [] def update_kwargs(self, kwargs): kwargs = kwargs.copy() diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1c80dd9e59164..fe65820a7c975 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -583,7 +583,7 @@ class _TestSQLApi(PandasSQLTest): """ flavor = "sqlite" - mode = None # type: str + mode: str def setup_connect(self): self.conn = self.connect() @@ -1234,7 +1234,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ - flavor = None # type: str + flavor: str @pytest.fixture(autouse=True, scope="class") def setup_class(cls): diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index e443a7cc932be..d70780741aa88 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,5 +1,5 @@ from datetime import date, datetime, time as dt_time, timedelta -from typing import Dict, List, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import numpy as np import pytest @@ -95,7 +95,7 @@ def test_to_M8(): class Base: - _offset = None # type: Type[DateOffset] + _offset: Optional[Type[DateOffset]] = None d = Timestamp(datetime(2008, 1, 2)) timezones = [ @@ -743,7 +743,7 @@ def test_onOffset(self): for offset, d, expected in tests: assert_onOffset(offset, d, expected) - apply_cases = [] # type: _ApplyCases + apply_cases: _ApplyCases = [] apply_cases.append( ( BDay(), @@ -2631,7 +2631,7 @@ def test_onOffset(self, case): offset, d, expected = case assert_onOffset(offset, d, expected) - apply_cases = [] # type: _ApplyCases + apply_cases: _ApplyCases = [] apply_cases.append( ( CDay(), @@ -2878,7 +2878,7 @@ def test_onOffset(self, case): offset, d, expected = case assert_onOffset(offset, d, expected) - apply_cases = [] # type: _ApplyCases + apply_cases: _ApplyCases = [] apply_cases.append( ( CBMonthEnd(), @@ -3027,7 +3027,7 @@ def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) - apply_cases = [] # type: _ApplyCases + apply_cases: _ApplyCases = [] apply_cases.append( ( CBMonthBegin(), diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 9ec0dce438099..898060d011372 
100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -49,7 +49,7 @@ # Offset names ("time rules") and related functions #: cache of previously seen offsets -_offset_map = {} # type: Dict[str, DateOffset] +_offset_map: Dict[str, DateOffset] = {} def get_period_alias(offset_str): diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index d4f02286ff8d6..9417dc4b48499 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -344,7 +344,7 @@ class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass): Abstract interface to create holidays following certain rules. """ - rules = [] # type: List[Holiday] + rules: List[Holiday] = [] start_date = Timestamp(datetime(1970, 1, 1)) end_date = Timestamp(datetime(2200, 12, 31)) _cache = None diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index f5e40e712642e..e516d30d5490f 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1817,8 +1817,8 @@ class QuarterOffset(DateOffset): Quarter representation - doesn't call super. """ - _default_startingMonth = None # type: Optional[int] - _from_name_startingMonth = None # type: Optional[int] + _default_startingMonth: Optional[int] = None + _from_name_startingMonth: Optional[int] = None _adjust_dst = True _attributes = frozenset(["n", "normalize", "startingMonth"]) # TODO: Consider combining QuarterOffset and YearOffset __init__ at some diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index f8c08ed8c099f..b8f17cd848292 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -327,9 +327,11 @@ def my_dog(has='fleas'): pass """ + addendum: Optional[str] + def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): if indents > 0: - self.addendum = indent(addendum, indents=indents) # type: Optional[str] + self.addendum = indent(addendum, indents=indents) else: self.addendum = addendum self.join = join From c5576293859f4351e508471811948e9a1dac4a30 Mon Sep 17 00:00:00 2001 From: ohad83 Date: Fri, 22 Nov 2019 00:46:03 +0200 Subject: [PATCH 157/185] CLN - Change string formatting in plotting (#29781) --- pandas/plotting/_core.py | 39 ++++++++++------------- pandas/plotting/_matplotlib/boxplot.py | 12 +++---- pandas/plotting/_matplotlib/converter.py | 25 +++++++-------- pandas/plotting/_matplotlib/core.py | 23 ++++++------- pandas/plotting/_matplotlib/misc.py | 2 +- pandas/plotting/_matplotlib/style.py | 2 +- pandas/plotting/_matplotlib/timeseries.py | 3 +- pandas/plotting/_matplotlib/tools.py | 9 ++---- pandas/plotting/_misc.py | 6 ++-- 9 files changed, 52 insertions(+), 69 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index da1e06dccc65d..beb276478070e 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -736,26 +736,23 @@ def _get_call_args(backend_name, data, args, kwargs): ] else: raise TypeError( - ( - "Called plot accessor for type {}, expected Series or DataFrame" - ).format(type(data).__name__) + f"Called plot accessor for type {type(data).__name__}, " + "expected Series or DataFrame" ) if args and isinstance(data, ABCSeries): + positional_args = str(args)[1:-1] + keyword_args = ", ".join( + f"{name}={value!r}" for (name, default), value in zip(arg_def, args) + ) msg = ( "`Series.plot()` should not be called with positional " "arguments, only keyword arguments. The order of " "positional arguments will change in the future. " - "Use `Series.plot({})` instead of `Series.plot({})`." 
- ) - positional_args = str(args)[1:-1] - keyword_args = ", ".join( - "{}={!r}".format(name, value) - for (name, default), value in zip(arg_def, args) - ) - warnings.warn( - msg.format(keyword_args, positional_args), FutureWarning, stacklevel=3 + f"Use `Series.plot({keyword_args})` instead of " + f"`Series.plot({positional_args})`." ) + warnings.warn(msg, FutureWarning, stacklevel=3) pos_args = {name: value for value, (name, _) in zip(args, arg_def)} if backend_name == "pandas.plotting._matplotlib": @@ -782,7 +779,7 @@ def __call__(self, *args, **kwargs): return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs) if kind not in self._all_kinds: - raise ValueError("{} is not a valid plot kind".format(kind)) + raise ValueError(f"{kind} is not a valid plot kind") # The original data structured can be transformed before passed to the # backend. For example, for DataFrame is common to set the index as the @@ -796,14 +793,13 @@ def __call__(self, *args, **kwargs): if isinstance(data, ABCDataFrame): return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) else: - raise ValueError( - ("plot kind {} can only be used for data frames").format(kind) - ) + raise ValueError(f"plot kind {kind} can only be used for data frames") elif kind in self._series_kinds: if isinstance(data, ABCDataFrame): if y is None and kwargs.get("subplots") is False: - msg = "{} requires either y column or 'subplots=True'" - raise ValueError(msg.format(kind)) + raise ValueError( + f"{kind} requires either y column or 'subplots=True'" + ) elif y is not None: if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] @@ -1639,12 +1635,11 @@ def _find_backend(backend: str): _backends[backend] = module return module - msg = ( - "Could not find plotting backend '{name}'. Ensure that you've installed the " - "package providing the '{name}' entrypoint, or that the package has a" + raise ValueError( + f"Could not find plotting backend '{backend}'. Ensure that you've installed " + f"the package providing the '{backend}' entrypoint, or that the package has a " "top-level `.plot` method." ) - raise ValueError(msg.format(name=backend)) def _get_plot_backend(backend=None): diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 274f06cd3ec1d..7bcca659ee3f6 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -74,9 +74,8 @@ def _validate_color_args(self): for key, values in self.color.items(): if key not in valid_keys: raise ValueError( - "color dict contains invalid " - "key '{0}' " - "The key must be either {1}".format(key, valid_keys) + f"color dict contains invalid key '{key}'. " + f"The key must be either {valid_keys}" ) else: self.color = None @@ -217,7 +216,7 @@ def _grouped_plot_by_column( result = axes byline = by[0] if len(by) == 1 else by - fig.suptitle("Boxplot grouped by {byline}".format(byline=byline)) + fig.suptitle(f"Boxplot grouped by {byline}") fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) return result @@ -268,9 +267,8 @@ def _get_colors(): result[key_to_index[key]] = value else: raise ValueError( - "color dict contains invalid " - "key '{0}' " - "The key must be either {1}".format(key, valid_keys) + f"color dict contains invalid key '{key}'. 
" + f"The key must be either {valid_keys}" ) else: result.fill(colors) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 4b0ba2bd423df..feb895a099da5 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -125,7 +125,7 @@ def time2num(d): if isinstance(d, str): parsed = tools.to_datetime(d) if not isinstance(parsed, datetime): - raise ValueError("Could not parse time {d}".format(d=d)) + raise ValueError(f"Could not parse time {d}") return _to_ordinalf(parsed.time()) if isinstance(d, pydt.time): return _to_ordinalf(d) @@ -244,7 +244,7 @@ def get_datevalue(date, freq): return date elif date is None: return None - raise ValueError("Unrecognizable date '{date}'".format(date=date)) + raise ValueError(f"Unrecognizable date '{date}'") def _dt_to_float_ordinal(dt): @@ -421,12 +421,10 @@ def __call__(self): if estimate > self.MAXTICKS * 2: raise RuntimeError( - ( - "MillisecondLocator estimated to generate " - "{estimate:d} ticks from {dmin} to {dmax}: " - "exceeds Locator.MAXTICKS" - "* 2 ({arg:d}) " - ).format(estimate=estimate, dmin=dmin, dmax=dmax, arg=self.MAXTICKS * 2) + "MillisecondLocator estimated to generate " + f"{estimate:d} ticks from {dmin} to {dmax}: " + "exceeds Locator.MAXTICKS" + f"* 2 ({self.MAXTICKS * 2:d}) " ) interval = self._get_interval() @@ -582,7 +580,7 @@ def _daily_finder(vmin, vmax, freq): elif freq == FreqGroup.FR_HR: periodsperday = 24 else: # pragma: no cover - raise ValueError("unexpected frequency: {freq}".format(freq=freq)) + raise ValueError(f"unexpected frequency: {freq}") periodsperyear = 365 * periodsperday periodspermonth = 28 * periodsperday @@ -941,8 +939,7 @@ def get_finder(freq): elif (freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK: return _daily_finder else: # pragma: no cover - errmsg = "Unsupported frequency: {freq}".format(freq=freq) - raise NotImplementedError(errmsg) + raise NotImplementedError(f"Unsupported frequency: {freq}") class TimeSeries_DateLocator(Locator): @@ -1119,11 +1116,11 @@ def format_timedelta_ticks(x, pos, n_decimals): h, m = divmod(m, 60) d, h = divmod(h, 24) decimals = int(ns * 10 ** (n_decimals - 9)) - s = r"{:02d}:{:02d}:{:02d}".format(int(h), int(m), int(s)) + s = f"{int(h):02d}:{int(m):02d}:{int(s):02d}" if n_decimals > 0: - s += ".{{:0{:0d}d}}".format(n_decimals).format(decimals) + s += f".{decimals:0{n_decimals}d}" if d != 0: - s = "{:d} days ".format(int(d)) + s + s = f"{int(d):d} days {s}" return s def __call__(self, x, pos=0): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 0c5375ccc5d5c..f2efed30c48e8 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -349,8 +349,7 @@ def _setup_subplots(self): if input_log - valid_log: invalid_log = next(iter((input_log - valid_log))) raise ValueError( - "Boolean, None and 'sym' are valid options," - " '{}' is given.".format(invalid_log) + f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given." 
) if self.logx is True or self.loglog is True: @@ -501,14 +500,13 @@ def _adorn_subplots(self): if self.subplots: if is_list_like(self.title): if len(self.title) != self.nseries: - msg = ( + raise ValueError( "The length of `title` must equal the number " "of columns if using `title` of type `list` " "and `subplots=True`.\n" - "length of title = {}\n" - "number of columns = {}" - ).format(len(self.title), self.nseries) - raise ValueError(msg) + f"length of title = {len(self.title)}\n" + f"number of columns = {self.nseries}" + ) for (ax, title) in zip(self.axes, self.title): ax.set_title(title) @@ -813,11 +811,10 @@ def match_labels(data, e): or (err_shape[1] != 2) or (err_shape[2] != len(self.data)) ): - msg = ( + raise ValueError( "Asymmetrical error bars should be provided " - + "with the shape (%u, 2, %u)" % (self.nseries, len(self.data)) + f"with the shape ({self.nseries}, 2, {len(self.data)})" ) - raise ValueError(msg) # broadcast errors to each data series if len(err) == 1: @@ -827,7 +824,7 @@ def match_labels(data, e): err = np.tile([err], (self.nseries, len(self.data))) else: - msg = "No valid {label} detected".format(label=label) + msg = f"No valid {label} detected" raise ValueError(msg) return err @@ -1178,7 +1175,7 @@ def _get_stacked_values(cls, ax, stacking_id, values, label): raise ValueError( "When stacked is True, each column must be either " "all positive or negative." - "{0} contains both positive and negative values".format(label) + f"{label} contains both positive and negative values" ) @classmethod @@ -1473,7 +1470,7 @@ class PiePlot(MPLPlot): def __init__(self, data, kind=None, **kwargs): data = data.fillna(value=0) if (data < 0).any().any(): - raise ValueError("{0} doesn't allow negative values".format(kind)) + raise ValueError(f"{kind} doesn't allow negative values") MPLPlot.__init__(self, data, kind=kind, **kwargs) def _args_adjust(self): diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 6d2363668e650..0720f544203f7 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -395,7 +395,7 @@ def lag_plot(series, lag=1, ax=None, **kwds): if ax is None: ax = plt.gca() ax.set_xlabel("y(t)") - ax.set_ylabel("y(t + {lag})".format(lag=lag)) + ax.set_ylabel(f"y(t + {lag})") ax.scatter(y1, y2, **kwds) return ax diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 927b9cf4e392a..fd69265b18a5b 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -20,7 +20,7 @@ def _get_standard_colors( cmap = colormap colormap = cm.get_cmap(colormap) if colormap is None: - raise ValueError("Colormap {0} is not recognized".format(cmap)) + raise ValueError(f"Colormap {cmap} is not recognized") colors = [colormap(num) for num in np.linspace(0, 1, num=num_colors)] elif color is not None: if colormap is not None: diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 931c699d9b9fd..fa9585e1fc229 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -307,7 +307,8 @@ def _maybe_convert_index(ax, data): def _format_coord(freq, t, y): - return "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y) + time_period = Period(ordinal=int(t), freq=freq) + return f"t = {time_period} y = {y:8f}" def format_dateaxis(subplot, freq, index): diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 
bcbe5eea8b5ab..dd4034a97f58e 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -60,10 +60,7 @@ def _get_layout(nplots, layout=None, layout_type="box"): if nrows * ncols < nplots: raise ValueError( - "Layout of {nrows}x{ncols} must be larger " - "than required size {nplots}".format( - nrows=nrows, ncols=ncols, nplots=nplots - ) + f"Layout of {nrows}x{ncols} must be larger than required size {nplots}" ) return layout @@ -203,8 +200,8 @@ def _subplots( return fig, ax else: raise ValueError( - "The number of passed axes must be {0}, the " - "same as the output plot".format(naxes) + f"The number of passed axes must be {naxes}, the " + "same as the output plot" ) fig = ax.get_figure() diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index b8f5a0d83b5c1..1087d314b1bf7 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -446,9 +446,7 @@ def __init__(self, deprecated=False): def __getitem__(self, key): key = self._get_canonical_key(key) if key not in self: - raise ValueError( - "{key} is not a valid pandas plotting option".format(key=key) - ) + raise ValueError(f"{key} is not a valid pandas plotting option") return super().__getitem__(key) def __setitem__(self, key, value): @@ -458,7 +456,7 @@ def __setitem__(self, key, value): def __delitem__(self, key): key = self._get_canonical_key(key) if key in self._DEFAULT_KEYS: - raise ValueError("Cannot remove default parameter {key}".format(key=key)) + raise ValueError(f"Cannot remove default parameter {key}") return super().__delitem__(key) def __contains__(self, key): From f9dae670f3cddf7eb7a39e9a32fb5095a452c2a0 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 22 Nov 2019 01:14:46 +0000 Subject: [PATCH 158/185] DEPS: Removing unused pytest-mock (#29784) --- ci/deps/azure-36-32bit.yaml | 1 - ci/deps/azure-36-locale.yaml | 1 - ci/deps/azure-36-locale_slow.yaml | 1 - ci/deps/azure-36-minimum_versions.yaml | 1 - ci/deps/azure-37-locale.yaml | 1 - ci/deps/azure-37-numpydev.yaml | 1 - ci/deps/azure-macos-36.yaml | 1 - ci/deps/azure-windows-36.yaml | 1 - ci/deps/azure-windows-37.yaml | 1 - ci/deps/travis-36-cov.yaml | 1 - ci/deps/travis-36-locale.yaml | 1 - ci/deps/travis-36-slow.yaml | 1 - ci/deps/travis-37.yaml | 1 - 13 files changed, 13 deletions(-) diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index f3e3d577a7a33..cf3fca307481f 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml @@ -8,7 +8,6 @@ dependencies: # tools ### Cython 0.29.13 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 3baf975afc096..c3c94e365c259 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -9,7 +9,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 01741e9b65a7a..46ddd44931848 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -9,7 +9,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index 1e32ef7482be3..ff1095005fa85 100644 --- 
a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -9,7 +9,6 @@ dependencies: - cython=0.29.13 - pytest=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 26446ab5365b1..3319afed173b5 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -9,7 +9,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 3264df5944e35..a04bdc2448bce 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -8,7 +8,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 48ba87d26f53d..831b68d0bb4d3 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -8,7 +8,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index e3ad1d8371623..3aa261a57f2d4 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -9,7 +9,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 07e134b054c10..928896efd5fc4 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -9,7 +9,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 9148e0d4b29d9..170edd90ea3d7 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -9,7 +9,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 - pytest-cov # this is only needed in the coverage build diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 3199ee037bc0a..5dc1e4524ec86 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -9,7 +9,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 # pandas dependencies diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index eab374c96772c..1dfd90d0904ac 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -9,7 +9,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 # pandas dependencies diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 7b75a427a4954..6826a9d072ff3 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -10,7 +10,6 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-mock - hypothesis>=3.58.0 # pandas dependencies From 750fbeecd2b6b60d3bb04a3d66baa630cce6a424 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 22 Nov 2019 15:25:52 +0000 Subject: [PATCH 159/185] DEPS: Unifying version of pytest-xdist across builds (#29783) --- ci/deps/travis-38.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/travis-38.yaml 
b/ci/deps/travis-38.yaml index 88da1331b463a..828f02596a70e 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.13 - pytest>=5.0.1 - - pytest-xdist>=1.29.0 # The rest of the builds use >=1.21, and use pytest-mock + - pytest-xdist>=1.21 - hypothesis>=3.58.0 # pandas dependencies From 36016baaaada9495628ea1bb7520bfe1dc9e50fb Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 22 Nov 2019 15:26:48 +0000 Subject: [PATCH 160/185] DOC: Updating documentation of new required pytest version (#29782) --- doc/source/development/contributing.rst | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/compat/_optional.py | 1 + pandas/util/_tester.py | 2 +- 5 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 042d6926d84f5..d7b3e159f8ce7 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -946,7 +946,7 @@ extensions in `numpy.testing .. note:: - The earliest supported pytest version is 4.0.2. + The earliest supported pytest version is 5.0.1. Writing tests ~~~~~~~~~~~~~ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 663948fd46cf6..3ba3a2fd4be1b 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -177,7 +177,7 @@ pandas is equipped with an exhaustive set of unit tests, covering about 97% of the code base as of this writing. To run it on your machine to verify that everything is working (and that you have all of the dependencies, soft and hard, installed), make sure you have `pytest -`__ >= 4.0.2 and `Hypothesis +`__ >= 5.0.1 and `Hypothesis `__ >= 3.58, then run: :: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 54640ff576338..6cbf655db4a44 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -253,6 +253,7 @@ Other API changes See :ref:`units registration ` for more. - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter. 
Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) +- When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index fc66502710b0c..ce9079ce8864d 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -18,6 +18,7 @@ "pandas_gbq": "0.8.0", "pyarrow": "0.9.0", "pytables": "3.4.2", + "pytest": "5.0.1", "s3fs": "0.3.0", "scipy": "0.19.0", "sqlalchemy": "1.1.4", diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 0f5324c8d02ba..7822ecdeeb4d8 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -11,7 +11,7 @@ def test(extra_args=None): try: import pytest except ImportError: - raise ImportError("Need pytest>=4.0.2 to run tests") + raise ImportError("Need pytest>=5.0.1 to run tests") try: import hypothesis # noqa except ImportError: From 3adc2e73c42763ac44f80e42255d232552fb1560 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Nov 2019 07:27:33 -0800 Subject: [PATCH 161/185] CLN: remove LegacyFoo from io.pytables (#29787) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/pytables.py | 77 +--------------------------------- 2 files changed, 2 insertions(+), 76 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6cbf655db4a44..b015f439935cb 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -322,6 +322,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. 
(:issue:`23601`) +- Removed support for legacy HDF5 formats (:issue:`29787`) - :func:`read_excel` removed support for "skip_footer" argument, use "skipfooter" instead (:issue:`18836`) - :meth:`DataFrame.to_records` no longer supports the argument "convert_datetime64" (:issue:`18902`) - Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ba53d8cfd0de5..c38bc1e48b029 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -179,9 +179,6 @@ class DuplicateWarning(Warning): # storer class map _STORER_MAP = { - "Series": "LegacySeriesFixed", - "DataFrame": "LegacyFrameFixed", - "DataMatrix": "LegacyFrameFixed", "series": "SeriesFixed", "frame": "FrameFixed", } @@ -3083,35 +3080,6 @@ def write_array(self, key: str, value, items=None): getattr(self.group, key)._v_attrs.transposed = transposed -class LegacyFixed(GenericFixed): - def read_index_legacy( - self, key: str, start: Optional[int] = None, stop: Optional[int] = None - ): - node = getattr(self.group, key) - data = node[start:stop] - kind = node._v_attrs.kind - return _unconvert_index_legacy( - data, kind, encoding=self.encoding, errors=self.errors - ) - - -class LegacySeriesFixed(LegacyFixed): - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index_legacy("index") - values = self.read_array("values") - return Series(values, index=index) - - -class LegacyFrameFixed(LegacyFixed): - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index_legacy("index") - columns = self.read_index_legacy("columns") - values = self.read_array("values") - return DataFrame(values, index=index, columns=columns) - - class SeriesFixed(GenericFixed): pandas_kind = "series" attributes = ["name"] @@ -4139,35 +4107,7 @@ def write(self, **kwargs): raise NotImplementedError("WORKTable needs to implement write") -class LegacyTable(Table): - """ an appendable table: allow append/query/delete operations to a - (possibly) already existing appendable table this table ALLOWS - append (but doesn't require them), and stores the data in a format - that can be easily searched - - """ - - _indexables: Optional[List[IndexCol]] = [ - IndexCol(name="index", axis=1, pos=0), - IndexCol(name="column", axis=2, pos=1, index_kind="columns_kind"), - DataCol(name="fields", cname="values", kind_attr="fields", pos=2), - ] - table_type = "legacy" - ndim = 3 - - def write(self, **kwargs): - raise TypeError("write operations are not allowed on legacy tables!") - - def read(self, where=None, columns=None, **kwargs): - """we have n indexable columns, with an arbitrary number of data - axes - """ - - if not self.read_axes(where=where, **kwargs): - return None - - -class AppendableTable(LegacyTable): +class AppendableTable(Table): """ support the new appendable table formats """ _indexables = None @@ -4866,21 +4806,6 @@ def _unconvert_index(data, kind, encoding=None, errors="strict"): return index -def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, errors="strict"): - kind = _ensure_decoded(kind) - if kind == "datetime": - index = to_datetime(data) - elif kind in ("integer"): - index = np.asarray(data, dtype=object) - elif kind in ("string"): - index = _unconvert_string_array( - data, nan_rep=None, encoding=encoding, errors=errors - ) - else: # pragma: no cover - raise ValueError("unrecognized index type {kind}".format(kind=kind)) - return index - - def 
_convert_string_array(data, encoding, errors, itemsize=None): """ we take a string-like that is object dtype and coerce to a fixed size From e8c370aa59bb66d9125478407b5dfe31baff62bd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Nov 2019 07:29:00 -0800 Subject: [PATCH 162/185] REF: avoid returning self in io.pytables (#29776) --- pandas/io/pytables.py | 52 ++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c38bc1e48b029..ce20d028525f8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1757,18 +1757,11 @@ def __init__( assert isinstance(self.cname, str) assert isinstance(self.kind_attr, str) - def set_axis(self, axis: int): - """ set the axis over which I index """ - self.axis = axis - - return self - def set_pos(self, pos: int): """ set the position of this column in the Table """ self.pos = pos if pos is not None and self.typ is not None: self.typ._v_pos = pos - return self def __repr__(self) -> str: temp = tuple( @@ -1843,8 +1836,6 @@ def convert( self.values = _set_tz(self.values, self.tz) - return self - def take_data(self): """ return the values & release the memory """ self.values, values = None, self.values @@ -1965,8 +1956,6 @@ def update_info(self, info): if value is not None or existing_value is not None: idx[key] = value - return self - def set_info(self, info): """ set my state from the passed info """ idx = info.get(self.name) @@ -2039,14 +2028,10 @@ def convert( """ assert self.table is not None # for mypy - assert self.table is not None - _start = start if start is not None else 0 _stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows self.values = Int64Index(np.arange(_stop - _start)) - return self - def get_attr(self): pass @@ -2486,8 +2471,6 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): self.data, nan_rep=nan_rep, encoding=encoding, errors=errors ) - return self - def get_attr(self): """ get the data for this column """ self.values = getattr(self.attrs, self.kind_attr, None) @@ -3768,9 +3751,12 @@ def create_axes( if i in axes: name = obj._AXIS_NAMES[i] - index_axes_map[i] = _convert_index( + new_index = _convert_index( name, a, self.encoding, self.errors, self.format_type - ).set_axis(i) + ) + new_index.axis = i + index_axes_map[i] = new_index + else: # we might be able to change the axes on the appending data if @@ -3797,10 +3783,12 @@ def create_axes( self.non_index_axes.append((i, append_axis)) # set axis positions (based on the axes) - self.index_axes = [ - index_axes_map[a].set_pos(j).update_info(self.info) - for j, a in enumerate(axes) - ] + new_index_axes = [index_axes_map[a] for a in axes] + for j, iax in enumerate(new_index_axes): + iax.set_pos(j) + iax.update_info(self.info) + self.index_axes = new_index_axes + j = len(self.index_axes) # check for column conflicts @@ -4069,19 +4057,13 @@ def read_column( # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series( - _set_tz( - a.convert( - c[start:stop], - nan_rep=self.nan_rep, - encoding=self.encoding, - errors=self.errors, - ).take_data(), - a.tz, - True, - ), - name=column, + a.convert( + c[start:stop], + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, ) + return Series(_set_tz(a.take_data(), a.tz, True), name=column) raise KeyError("column [{column}] not found in the table".format(column=column)) From 
1819bbbfb5e8887bcc2bfbfcaf75e83f61ffc27c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Nov 2019 07:30:16 -0800 Subject: [PATCH 163/185] CLN: typing and renaming in computation.pytables (#29778) --- pandas/core/computation/pytables.py | 52 +++++++++++++++------------ pandas/core/computation/scope.py | 2 +- pandas/io/pytables.py | 6 ++-- pandas/tests/computation/test_eval.py | 2 +- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 8dee273517f88..58bbfd0a1bdee 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -21,7 +21,7 @@ from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -class Scope(_scope.Scope): +class PyTablesScope(_scope.Scope): __slots__ = ("queryables",) queryables: Dict[str, Any] @@ -38,13 +38,13 @@ def __init__( class Term(ops.Term): - env: Scope + env: PyTablesScope def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls return object.__new__(klass) - def __init__(self, name, env: Scope, side=None, encoding=None): + def __init__(self, name, env: PyTablesScope, side=None, encoding=None): super().__init__(name, env, side=side, encoding=encoding) def _resolve_name(self): @@ -68,7 +68,8 @@ def value(self): class Constant(Term): - def __init__(self, value, env, side=None, encoding=None): + def __init__(self, value, env: PyTablesScope, side=None, encoding=None): + assert isinstance(env, PyTablesScope), type(env) super().__init__(value, env, side=side, encoding=encoding) def _resolve_name(self): @@ -270,7 +271,7 @@ def evaluate(self): raise ValueError("query term is not valid [{slf}]".format(slf=self)) rhs = self.conform(self.rhs) - values = [TermValue(v, v, self.kind).value for v in rhs] + values = list(rhs) if self.is_in_table: @@ -386,7 +387,7 @@ def prune(self, klass): return None -class ExprVisitor(BaseExprVisitor): +class PyTablesExprVisitor(BaseExprVisitor): const_type = Constant term_type = Term @@ -486,25 +487,29 @@ def _validate_where(w): TypeError : An invalid data type was passed in for w (e.g. dict). """ - if not (isinstance(w, (Expr, str)) or is_list_like(w)): - raise TypeError("where must be passed as a string, Expr, or list-like of Exprs") + if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)): + raise TypeError( + "where must be passed as a string, PyTablesExpr, " + "or list-like of PyTablesExpr" + ) return w -class Expr(expr.Expr): - """ hold a pytables like expression, comprised of possibly multiple 'terms' +class PyTablesExpr(expr.Expr): + """ + Hold a pytables-like expression, comprised of possibly multiple 'terms'. 
Parameters ---------- - where : string term expression, Expr, or list-like of Exprs + where : string term expression, PyTablesExpr, or list-like of PyTablesExprs queryables : a "kinds" map (dict of column name -> kind), or None if column is non-indexable encoding : an encoding that will encode the query terms Returns ------- - an Expr object + a PyTablesExpr object Examples -------- @@ -520,8 +525,8 @@ class Expr(expr.Expr): "major_axis>=20130101" """ - _visitor: Optional[ExprVisitor] - env: Scope + _visitor: Optional[PyTablesExprVisitor] + env: PyTablesScope def __init__( self, @@ -542,14 +547,14 @@ def __init__( # capture the environment if needed local_dict = DeepChainMap() - if isinstance(where, Expr): + if isinstance(where, PyTablesExpr): local_dict = where.env.scope _where = where.expr elif isinstance(where, (list, tuple)): where = list(where) for idx, w in enumerate(where): - if isinstance(w, Expr): + if isinstance(w, PyTablesExpr): local_dict = w.env.scope else: w = _validate_where(w) @@ -559,11 +564,11 @@ def __init__( _where = where self.expr = _where - self.env = Scope(scope_level + 1, local_dict=local_dict) + self.env = PyTablesScope(scope_level + 1, local_dict=local_dict) if queryables is not None and isinstance(self.expr, str): self.env.queryables.update(queryables) - self._visitor = ExprVisitor( + self._visitor = PyTablesExprVisitor( self.env, queryables=queryables, parser="pytables", @@ -601,30 +606,31 @@ def evaluate(self): class TermValue: """ hold a term value the we use to construct a condition/filter """ - def __init__(self, value, converted, kind: Optional[str]): + def __init__(self, value, converted, kind: str): + assert isinstance(kind, str), kind self.value = value self.converted = converted self.kind = kind - def tostring(self, encoding): + def tostring(self, encoding) -> str: """ quote the string if not encoded else encode and return """ if self.kind == "string": if encoding is not None: - return self.converted + return str(self.converted) return '"{converted}"'.format(converted=self.converted) elif self.kind == "float": # python 2 str(float) is not always # round-trippable so use repr() return repr(self.converted) - return self.converted + return str(self.converted) def maybe_expression(s) -> bool: """ loose checking if s is a pytables-acceptable expression """ if not isinstance(s, str): return False - ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ("=",) + ops = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",) # make sure we have an op at least return any(op in s for op in ops) diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 71aa885816670..78a47afcc0830 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -162,7 +162,7 @@ def has_resolvers(self) -> bool: """ return bool(len(self.resolvers)) - def resolve(self, key, is_local): + def resolve(self, key: str, is_local: bool): """ Resolve a variable name in a possibly local context. 
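The renames in this patch are behavior-preserving: the classes gain PyTables-prefixed names so they can no longer be confused with computation.expr's Expr and Scope, and the public entry point keeps working because the pytables.py hunk below rebinds Term to PyTablesExpr. A minimal, hypothetical sketch of that rename-plus-alias pattern (the class body is illustrative, not the pandas implementation):

class PyTablesExpr:
    # renamed class; the implementation itself is unchanged
    def __init__(self, where):
        self.expr = where

Term = PyTablesExpr  # backwards-compatible alias for existing call sites

assert Term is PyTablesExpr
assert Term("index > 5").expr == "index > 5"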
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ce20d028525f8..7ac434f45e180 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -48,7 +48,7 @@ from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.sparse import BlockIndex, IntIndex import pandas.core.common as com -from pandas.core.computation.pytables import Expr, maybe_expression +from pandas.core.computation.pytables import PyTablesExpr, maybe_expression from pandas.core.index import ensure_index from pandas.core.internals import BlockManager, _block_shape, make_block @@ -93,7 +93,7 @@ def _ensure_str(name): return name -Term = Expr +Term = PyTablesExpr def _ensure_term(where, scope_level: int): @@ -4954,7 +4954,7 @@ def generate(self, where): q = self.table.queryables() try: - return Expr(where, queryables=q, encoding=self.table.encoding) + return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) except NameError: # raise a nice message, suggesting that the user should use # data_columns diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index c6ce08080314a..66e8e1bebfe98 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1891,7 +1891,7 @@ def test_invalid_parser(): _parsers: Dict[str, Type[BaseExprVisitor]] = { "python": PythonExprVisitor, - "pytables": pytables.ExprVisitor, + "pytables": pytables.PyTablesExprVisitor, "pandas": PandasExprVisitor, } From 11993e0b7f50d49892ad112ac07335b81c6a6998 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Nov 2019 07:30:57 -0800 Subject: [PATCH 164/185] TYP: io.pytables types (#29777) --- pandas/io/pytables.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7ac434f45e180..8afbd293a095b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1401,7 +1401,7 @@ def _check_if_open(self): if not self.is_open: raise ClosedFileError("{0} file is not open!".format(self._path)) - def _validate_format(self, format, kwargs): + def _validate_format(self, format: str, kwargs: Dict[str, Any]) -> Dict[str, Any]: """ validate / deprecate formats; return the new kwargs """ kwargs = kwargs.copy() @@ -1594,10 +1594,9 @@ class TableIterator: stop : the passed stop value (default is None) iterator : bool, default False Whether to use the default iterator. 
- chunksize : the passed chunking value (default is 50000) + chunksize : the passed chunking value (default is 100000) auto_close : boolean, automatically close the store at the end of iteration, default is False - kwargs : the passed kwargs """ chunksize: Optional[int] @@ -1613,7 +1612,7 @@ def __init__( start=None, stop=None, iterator: bool = False, - chunksize=None, + chunksize: Optional[int] = None, auto_close: bool = False, ): self.store = store @@ -3399,15 +3398,14 @@ def _get_metadata_path(self, key) -> str: """ return the metadata pathname for this key """ return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, key=key) - def write_metadata(self, key, values): + def write_metadata(self, key: str, values): """ write out a meta data array to the key as a fixed-format Series Parameters ---------- - key : string + key : str values : ndarray - """ values = Series(values) self.parent.put( @@ -3419,7 +3417,7 @@ def write_metadata(self, key, values): nan_rep=self.nan_rep, ) - def read_metadata(self, key): + def read_metadata(self, key: str): """ return the meta data array for this key """ if getattr(getattr(self.group, "meta", None), key, None) is not None: return self.parent.select(self._get_metadata_path(key)) @@ -3966,7 +3964,11 @@ def process_filter(field, filt): return obj def create_description( - self, complib=None, complevel=None, fletcher32: bool = False, expectedrows=None + self, + complib=None, + complevel=None, + fletcher32: bool = False, + expectedrows: Optional[int] = None, ): """ create the description of the table from the axes & values """ @@ -4150,7 +4152,7 @@ def write( # add the rows self.write_data(chunksize, dropna=dropna) - def write_data(self, chunksize, dropna=False): + def write_data(self, chunksize: Optional[int], dropna: bool = False): """ we form the data into a 2-d including indexes,values,mask write chunk-by-chunk """ @@ -4478,7 +4480,7 @@ class GenericTable(AppendableFrameTable): obj_type = DataFrame @property - def pandas_type(self): + def pandas_type(self) -> str: return self.pandas_kind @property @@ -4530,7 +4532,7 @@ class AppendableMultiFrameTable(AppendableFrameTable): _re_levels = re.compile(r"^level_\d+$") @property - def table_type_short(self): + def table_type_short(self) -> str: return "appendable_multi" def write(self, obj, data_columns=None, **kwargs): From 41e9e9bdeebe7e0d25e412d1bbfd57f49171bb9e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Nov 2019 07:31:18 -0800 Subject: [PATCH 165/185] CLN: catch ValueError instead of Exceeption (#29785) --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index ed2b21994fdca..5f38f866e1643 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -896,7 +896,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): try: tables = p.parse_tables() - except Exception as caught: + except ValueError as caught: # if `io` is an io-like object, check if it's seekable # and try to rewind it before trying the next parser if hasattr(io, "seekable") and io.seekable(): From 24684e281a62c47a61cefc5755e3700905260604 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Fri, 22 Nov 2019 17:32:43 +0200 Subject: [PATCH 166/185] CLN:F-string in pandas/_libs/tslibs/*.pyx (#29775) --- pandas/_libs/tslibs/c_timestamp.pyx | 24 +++++----- pandas/_libs/tslibs/conversion.pyx | 8 ++-- pandas/_libs/tslibs/fields.pyx | 11 +++-- pandas/_libs/tslibs/frequencies.pyx | 2 
+- pandas/_libs/tslibs/nattype.pyx | 12 +++-- pandas/_libs/tslibs/np_datetime.pyx | 8 ++-- pandas/_libs/tslibs/offsets.pyx | 19 ++++---- pandas/_libs/tslibs/parsing.pyx | 26 +++++------ pandas/_libs/tslibs/period.pyx | 44 +++++++++--------- pandas/_libs/tslibs/strptime.pyx | 32 +++++++------ pandas/_libs/tslibs/timedeltas.pyx | 67 +++++++++++----------------- pandas/_libs/tslibs/timestamps.pyx | 12 ++--- pandas/_libs/tslibs/timezones.pyx | 4 +- pandas/_libs/tslibs/tzconversion.pyx | 16 +++---- 14 files changed, 127 insertions(+), 158 deletions(-) diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 8512b34b9e78c..c6c98e996b745 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -55,11 +55,11 @@ def maybe_integer_op_deprecated(obj): # GH#22535 add/sub of integers and int-arrays is deprecated if obj.freq is not None: warnings.warn("Addition/subtraction of integers and integer-arrays " - "to {cls} is deprecated, will be removed in a future " + f"to {type(obj).__name__} is deprecated, " + "will be removed in a future " "version. Instead of adding/subtracting `n`, use " "`n * self.freq`" - .format(cls=type(obj).__name__), - FutureWarning) + , FutureWarning) cdef class _Timestamp(datetime): @@ -144,11 +144,10 @@ cdef class _Timestamp(datetime): # e.g. tzlocal has no `strftime` pass - tz = ", tz='{0}'".format(zone) if zone is not None else "" - freq = "" if self.freq is None else ", freq='{0}'".format(self.freqstr) + tz = f", tz='{zone}'" if zone is not None else "" + freq = "" if self.freq is None else f", freq='{self.freqstr}'" - return "Timestamp('{stamp}'{tz}{freq})".format(stamp=stamp, - tz=tz, freq=freq) + return f"Timestamp('{stamp}'{tz}{freq})" cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: @@ -370,23 +369,22 @@ cdef class _Timestamp(datetime): @property def _repr_base(self) -> str: - return '{date} {time}'.format(date=self._date_repr, - time=self._time_repr) + return f"{self._date_repr} {self._time_repr}" @property def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but # the datetime strftime() methods require year >= 1900 - return '%d-%.2d-%.2d' % (self.year, self.month, self.day) + return f'{self.year}-{self.month:02d}-{self.day:02d}' @property def _time_repr(self) -> str: - result = '%.2d:%.2d:%.2d' % (self.hour, self.minute, self.second) + result = f'{self.hour:02d}:{self.minute:02d}:{self.second:02d}' if self.nanosecond != 0: - result += '.%.9d' % (self.nanosecond + 1000 * self.microsecond) + result += f'.{self.nanosecond + 1000 * self.microsecond:09d}' elif self.microsecond != 0: - result += '.%.6d' % self.microsecond + result += f'.{self.microsecond:06d}' return result diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index bd74180403ad9..c5315219b8422 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -197,7 +197,7 @@ def datetime_to_datetime64(object[:] values): iresult[i] = pydatetime_to_dt64(val, &dts) check_dts_bounds(&dts) else: - raise TypeError('Unrecognized value type: %s' % type(val)) + raise TypeError(f'Unrecognized value type: {type(val)}') return result, inferred_tz @@ -326,8 +326,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, raise ValueError("Cannot convert Period to Timestamp " "unambiguously. 
Use to_timestamp") else: - raise TypeError('Cannot convert input [{}] of type {} to ' - 'Timestamp'.format(ts, type(ts))) + raise TypeError(f'Cannot convert input [{ts}] of type {type(ts)} to ' + f'Timestamp') if tz is not None: localize_tso(obj, tz) @@ -686,7 +686,7 @@ def normalize_date(dt: object) -> datetime: elif PyDate_Check(dt): return datetime(dt.year, dt.month, dt.day) else: - raise TypeError('Unrecognized type: %s' % type(dt)) + raise TypeError(f'Unrecognized type: {type(dt)}') @cython.wraparound(False) diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 8f5c8d10776df..dfed8d06530aa 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -130,7 +130,7 @@ def get_date_name_field(const int64_t[:] dtindex, object field, object locale=No out[i] = names[dts.month].capitalize() else: - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") return out @@ -165,8 +165,7 @@ def get_start_end_field(const int64_t[:] dtindex, object field, if freqstr: if freqstr == 'C': - raise ValueError("Custom business days is not supported by {field}" - .format(field=field)) + raise ValueError(f"Custom business days is not supported by {field}") is_business = freqstr[0] == 'B' # YearBegin(), BYearBegin() use month = starting month of year. @@ -373,7 +372,7 @@ def get_start_end_field(const int64_t[:] dtindex, object field, out[i] = 1 else: - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") return out.view(bool) @@ -537,7 +536,7 @@ def get_date_field(const int64_t[:] dtindex, object field): elif field == 'is_leap_year': return isleapyear_arr(get_date_field(dtindex, 'Y')) - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") @cython.wraparound(False) @@ -653,7 +652,7 @@ def get_timedelta_field(const int64_t[:] tdindex, object field): out[i] = tds.nanoseconds return out - raise ValueError("Field %s not supported" % field) + raise ValueError(f"Field {field} not supported") cpdef isleapyear_arr(ndarray years): diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index b29c841896072..660f4ddcec736 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -197,7 +197,7 @@ cpdef _base_and_stride(str freqstr): groups = opattern.match(freqstr) if not groups: - raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) + raise ValueError(f"Could not evaluate {freqstr}") stride = groups.group(1) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 3ddce28fb6dd1..6e5e62e77b4b1 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -115,8 +115,8 @@ cdef class _NaT(datetime): if is_datetime64_object(other): return _nat_scalar_rules[op] else: - raise TypeError('Cannot compare type %r with type %r' % - (type(self).__name__, type(other).__name__)) + raise TypeError(f'Cannot compare type {type(self).__name__} ' + f'with type {type(other).__name__}') # Note: instead of passing "other, self, _reverse_ops[op]", we observe # that `_nat_scalar_rules` is invariant under `_reverse_ops`, @@ -150,8 +150,7 @@ cdef class _NaT(datetime): result = np.empty(other.shape, dtype="datetime64[ns]") result.fill("NaT") return result - raise TypeError("Cannot add NaT to ndarray with dtype {dtype}" - .format(dtype=other.dtype)) + raise TypeError(f"Cannot 
add NaT to ndarray with dtype {other.dtype}") return NotImplemented @@ -203,9 +202,8 @@ cdef class _NaT(datetime): result.fill("NaT") return result - raise TypeError( - "Cannot subtract NaT from ndarray with dtype {dtype}" - .format(dtype=other.dtype)) + raise TypeError(f"Cannot subtract NaT from ndarray with " + f"dtype {other.dtype}") return NotImplemented diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index e76f84265a327..b9406074bb130 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -112,11 +112,9 @@ cdef inline check_dts_bounds(npy_datetimestruct *dts): error = True if error: - fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, - dts.day, dts.hour, - dts.min, dts.sec) - raise OutOfBoundsDatetime( - 'Out of bounds nanosecond timestamp: {fmt}'.format(fmt=fmt)) + fmt = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' + f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}') + raise OutOfBoundsDatetime(f'Out of bounds nanosecond timestamp: {fmt}') # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 434252677f1a1..68a0a4a403c81 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -66,16 +66,16 @@ need_suffix = ['QS', 'BQ', 'BQS', 'YS', 'AS', 'BY', 'BA', 'BYS', 'BAS'] for __prefix in need_suffix: for _m in MONTHS: - key = '%s-%s' % (__prefix, _m) + key = f'{__prefix}-{_m}' _offset_to_period_map[key] = _offset_to_period_map[__prefix] for __prefix in ['A', 'Q']: for _m in MONTHS: - _alias = '%s-%s' % (__prefix, _m) + _alias = f'{__prefix}-{_m}' _offset_to_period_map[_alias] = _alias for _d in DAYS: - _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d + _offset_to_period_map[f'W-{_d}'] = f'W-{_d}' # --------------------------------------------------------------------- @@ -432,9 +432,9 @@ class _BaseOffset: n_str = "" if self.n != 1: - n_str = "%s * " % self.n + n_str = f"{self.n} * " - out = '<%s' % n_str + className + plural + self._repr_attrs() + '>' + out = f'<{n_str}{className}{plural}{self._repr_attrs()}>' return out def _get_offset_day(self, datetime other): @@ -460,16 +460,13 @@ class _BaseOffset: ValueError if n != int(n) """ if util.is_timedelta64_object(n): - raise TypeError('`n` argument must be an integer, ' - 'got {ntype}'.format(ntype=type(n))) + raise TypeError(f'`n` argument must be an integer, got {type(n)}') try: nint = int(n) except (ValueError, TypeError): - raise TypeError('`n` argument must be an integer, ' - 'got {ntype}'.format(ntype=type(n))) + raise TypeError(f'`n` argument must be an integer, got {type(n)}') if n != nint: - raise ValueError('`n` argument must be an integer, ' - 'got {n}'.format(n=n)) + raise ValueError(f'`n` argument must be an integer, got {n}') return nint def __setstate__(self, state): diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 8fe724fa2f6f7..ecf3e35c86d76 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -153,7 +153,7 @@ cdef inline object _parse_delimited_date(object date_string, bint dayfirst): return datetime_new(year, month, day, 0, 0, 0, 0, None), reso return datetime(year, month, day, 0, 0, 0, 0, None), reso - raise DateParseError("Invalid date specified ({}/{})".format(month, day)) + raise DateParseError(f"Invalid date specified ({month}/{day})") cdef inline bint does_string_look_like_time(object parse_string): @@ -311,7 +311,7 @@ cdef 
parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, # TODO: allow raise of errors within instead raise DateParseError(err) if parsed is None: - raise DateParseError("Could not parse {dstr}".format(dstr=date_string)) + raise DateParseError(f"Could not parse {date_string}") return parsed, parsed, reso @@ -420,18 +420,18 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, raise ValueError if not (1 <= quarter <= 4): - msg = ('Incorrect quarterly string is given, quarter must be ' - 'between 1 and 4: {dstr}') - raise DateParseError(msg.format(dstr=date_string)) + raise DateParseError(f'Incorrect quarterly string is given, ' + f'quarter must be ' + f'between 1 and 4: {date_string}') if freq is not None: # hack attack, #1228 try: mnum = MONTH_NUMBERS[_get_rule_month(freq)] + 1 except (KeyError, ValueError): - msg = ('Unable to retrieve month information from given ' - 'freq: {freq}'.format(freq=freq)) - raise DateParseError(msg) + raise DateParseError(f'Unable to retrieve month ' + f'information from given ' + f'freq: {freq}') month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: @@ -464,7 +464,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, except ValueError: pass - raise ValueError('Unable to parse {0}'.format(date_string)) + raise ValueError(f'Unable to parse {date_string}') cdef dateutil_parse(object timestr, object default, ignoretz=False, @@ -484,8 +484,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, res, _ = res if res is None: - msg = "Unknown datetime string format, unable to parse: {timestr}" - raise ValueError(msg.format(timestr=timestr)) + raise ValueError(f"Unknown datetime string format, unable to parse: {timestr}") for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: @@ -495,8 +494,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, reso = attr if reso is None: - msg = "Unable to parse datetime string: {timestr}" - raise ValueError(msg.format(timestr=timestr)) + raise ValueError(f"Unable to parse datetime string: {timestr}") if reso == 'microsecond': if repl['microsecond'] == 0: @@ -710,7 +708,7 @@ class _timelex: elif getattr(instream, 'read', None) is None: raise TypeError( 'Parser must be a string or character stream, not ' - '{itype}'.format(itype=instream.__class__.__name__)) + f'{type(instream).__name__}') else: self.stream = instream.read() diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 2512fdb891e3e..80db081a4fc52 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1227,7 +1227,7 @@ def period_format(int64_t value, int freq, object fmt=None): elif freq_group == 12000: # NANOSEC fmt = b'%Y-%m-%d %H:%M:%S.%n' else: - raise ValueError('Unknown freq: {freq}'.format(freq=freq)) + raise ValueError(f'Unknown freq: {freq}') return _period_strftime(value, freq, fmt) @@ -1273,17 +1273,17 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): raise ValueError('Unable to get quarter and year') if i == 0: - repl = '%d' % quarter + repl = str(quarter) elif i == 1: # %f, 2-digit year - repl = '%.2d' % (year % 100) + repl = f'{(year % 100):02d}' elif i == 2: - repl = '%d' % year + repl = str(year) elif i == 3: - repl = '%03d' % (value % 1000) + repl = f'{(value % 1_000):03d}' elif i == 4: - repl = '%06d' % (value % 1000000) + repl = f'{(value % 1_000_000):06d}' elif i == 5: - repl = '%09d' % (value % 1000000000) + repl = f'{(value % 
1_000_000_000):09d}' result = result.replace(str_extra_fmts[i], repl) @@ -1391,7 +1391,7 @@ def get_period_field_arr(int code, int64_t[:] arr, int freq): func = _get_accessor_func(code) if func is NULL: - raise ValueError('Unrecognized period code: {code}'.format(code=code)) + raise ValueError(f'Unrecognized period code: {code}') sz = len(arr) out = np.empty(sz, dtype=np.int64) @@ -1578,9 +1578,8 @@ cdef class _Period: freq = to_offset(freq) if freq.n <= 0: - raise ValueError('Frequency must be positive, because it' - ' represents span: {freqstr}' - .format(freqstr=freq.freqstr)) + raise ValueError(f'Frequency must be positive, because it ' + f'represents span: {freq.freqstr}') return freq @@ -1614,9 +1613,8 @@ cdef class _Period: return NotImplemented elif op == Py_NE: return NotImplemented - raise TypeError('Cannot compare type {cls} with type {typ}' - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError(f'Cannot compare type {type(self).__name__} ' + f'with type {type(other).__name__}') def __hash__(self): return hash((self.ordinal, self.freqstr)) @@ -1634,8 +1632,8 @@ cdef class _Period: if nanos % offset_nanos == 0: ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) - msg = 'Input cannot be converted to Period(freq={0})' - raise IncompatibleFrequency(msg.format(self.freqstr)) + raise IncompatibleFrequency(f'Input cannot be converted to ' + f'Period(freq={self.freqstr})') elif util.is_offset_object(other): freqstr = other.rule_code base = get_base_alias(freqstr) @@ -1665,9 +1663,8 @@ cdef class _Period: # GH#17983 sname = type(self).__name__ oname = type(other).__name__ - raise TypeError("unsupported operand type(s) for +: '{self}' " - "and '{other}'".format(self=sname, - other=oname)) + raise TypeError(f"unsupported operand type(s) for +: '{sname}' " + f"and '{oname}'") else: # pragma: no cover return NotImplemented elif is_period_object(other): @@ -2218,7 +2215,7 @@ cdef class _Period: def __repr__(self) -> str: base, mult = get_freq_code(self.freq) formatted = period_format(self.ordinal, base) - return "Period('%s', '%s')" % (formatted, self.freqstr) + return f"Period('{formatted}', '{self.freqstr}')" def __str__(self) -> str: """ @@ -2226,7 +2223,7 @@ cdef class _Period: """ base, mult = get_freq_code(self.freq) formatted = period_format(self.ordinal, base) - value = ("%s" % formatted) + value = str(formatted) return value def __setstate__(self, state): @@ -2477,9 +2474,8 @@ class Period(_Period): try: freq = Resolution.get_freq(reso) except KeyError: - raise ValueError( - "Invalid frequency or could not infer: {reso}" - .format(reso=reso)) + raise ValueError(f"Invalid frequency or could not " + f"infer: {reso}") elif PyDateTime_Check(value): dt = value diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index fbda5f178e164..71e5006761097 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -106,11 +106,11 @@ def array_strptime(object[:] values, object fmt, if bad_directive == "\\": bad_directive = "%" del err - raise ValueError("'%s' is a bad directive in format '%s'" % - (bad_directive, fmt)) + raise ValueError(f"'{bad_directive}' is a bad directive " + f"in format '{fmt}'") # IndexError only occurs when the format string is "%" except IndexError: - raise ValueError("stray %% in format '%s'" % fmt) + raise ValueError("stray % in format '{fmt}'") _regex_cache[fmt] = format_regex result = np.empty(n, dtype='M8[ns]') @@ -139,14 +139,13 @@ def 
array_strptime(object[:] values, object fmt, if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match " - "format %r (match)" % (val, fmt)) + raise ValueError(f"time data '{val}' does not match " + f"format '{fmt}' (match)") if len(val) != found.end(): if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("unconverted data remains: %s" % - val[found.end():]) + raise ValueError(f"unconverted data remains: {val[found.end():]}") # search else: @@ -155,8 +154,8 @@ def array_strptime(object[:] values, object fmt, if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match format " - "%r (search)" % (val, fmt)) + raise ValueError(f"time data {repr(val)} does not match format " + f"{repr(fmt)} (search)") iso_year = -1 year = 1900 @@ -589,8 +588,8 @@ class TimeRE(dict): else: return '' regex = '|'.join(re.escape(stuff) for stuff in to_convert) - regex = '(?P<%s>%s' % (directive, regex) - return '%s)' % regex + regex = f'(?P<{directive}>{regex})' + return regex def pattern(self, format): """ @@ -609,11 +608,11 @@ class TimeRE(dict): format = whitespace_replacement.sub(r'\\s+', format) while '%' in format: directive_index = format.index('%') +1 - processed_format = "%s%s%s" % (processed_format, - format[:directive_index -1], - self[format[directive_index]]) + processed_format = (f"{processed_format}" + f"{format[:directive_index -1]}" + f"{self[format[directive_index]]}") format = format[directive_index +1:] - return "%s%s" % (processed_format, format) + return f"{processed_format}{format}" def compile(self, format): """Return a compiled re object for the format string.""" @@ -737,8 +736,7 @@ cdef parse_timezone_directive(str z): z = z[:3] + z[4:] if len(z) > 5: if z[5] != ':': - msg = "Inconsistent use of : in {0}" - raise ValueError(msg.format(z)) + raise ValueError(f"Inconsistent use of : in {z}") z = z[:5] + z[6:] hours = int(z[1:3]) minutes = int(z[3:5]) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 21dbdfbb111ed..8e5b719749857 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -170,7 +170,7 @@ cdef convert_to_timedelta64(object ts, object unit): if ts.astype('int64') == NPY_NAT: return np.timedelta64(NPY_NAT) elif is_timedelta64_object(ts): - ts = ts.astype("m8[{unit}]".format(unit=unit.lower())) + ts = ts.astype(f"m8[{unit.lower()}]") elif is_integer_object(ts): if ts == NPY_NAT: return np.timedelta64(NPY_NAT) @@ -198,8 +198,7 @@ cdef convert_to_timedelta64(object ts, object unit): if PyDelta_Check(ts): ts = np.timedelta64(delta_to_nanoseconds(ts), 'ns') elif not is_timedelta64_object(ts): - raise ValueError("Invalid type for timedelta " - "scalar: {ts_type}".format(ts_type=type(ts))) + raise ValueError(f"Invalid type for timedelta scalar: {type(ts)}") return ts.astype('timedelta64[ns]') @@ -288,7 +287,7 @@ cpdef inline object precision_from_unit(object unit): m = 1L p = 0 else: - raise ValueError("cannot cast unit {unit}".format(unit=unit)) + raise ValueError(f"cannot cast unit {unit}") return m, p @@ -397,8 +396,7 @@ cdef inline int64_t parse_timedelta_string(str ts) except? 
-1: result += timedelta_as_neg(r, neg) have_hhmmss = 1 else: - raise ValueError("expecting hh:mm:ss format, " - "received: {ts}".format(ts=ts)) + raise ValueError(f"expecting hh:mm:ss format, received: {ts}") unit, number = [], [] @@ -511,7 +509,7 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): unit = 'm' unit = parse_timedelta_unit(unit) except KeyError: - raise ValueError("invalid abbreviation: {unit}".format(unit=unit)) + raise ValueError(f"invalid abbreviation: {unit}") n = ''.join(number) + '.' + ''.join(frac) return cast_from_unit(float(n), unit) @@ -530,8 +528,7 @@ cpdef inline object parse_timedelta_unit(object unit): try: return timedelta_abbrevs[unit.lower()] except (KeyError, AttributeError): - raise ValueError("invalid unit abbreviation: {unit}" - .format(unit=unit)) + raise ValueError(f"invalid unit abbreviation: {unit}") # ---------------------------------------------------------------------- # Timedelta ops utilities @@ -727,8 +724,7 @@ cdef _to_py_int_float(v): return int(v) elif is_float_object(v): return float(v) - raise TypeError("Invalid type {typ}. Must be int or " - "float.".format(typ=type(v))) + raise TypeError(f"Invalid type {type(v)}. Must be int or float.") # Similar to Timestamp/datetime, this is a construction requirement for @@ -773,10 +769,9 @@ cdef class _Timedelta(timedelta): elif op == Py_NE: return True # only allow ==, != ops - raise TypeError('Cannot compare type {cls} with ' - 'type {other}' - .format(cls=type(self).__name__, - other=type(other).__name__)) + raise TypeError(f'Cannot compare type ' + f'{type(self).__name__} with ' + f'type {type(other).__name__}') if util.is_array(other): return PyObject_RichCompare(np.array([self]), other, op) return PyObject_RichCompare(other, self, reverse_ops[op]) @@ -787,10 +782,8 @@ cdef class _Timedelta(timedelta): return False elif op == Py_NE: return True - raise TypeError('Cannot compare type {cls} with ' - 'type {other}' - .format(cls=type(self).__name__, - other=type(other).__name__)) + raise TypeError(f'Cannot compare type {type(self).__name__} with ' + f'type {type(other).__name__}') return cmp_scalar(self.value, ots.value, op) @@ -1143,7 +1136,8 @@ cdef class _Timedelta(timedelta): return fmt.format(**comp_dict) def __repr__(self) -> str: - return "Timedelta('{val}')".format(val=self._repr_base(format='long')) + repr_based = self._repr_base(format='long') + return f"Timedelta('{repr_based}')" def __str__(self) -> str: return self._repr_base(format='long') @@ -1189,14 +1183,14 @@ cdef class _Timedelta(timedelta): 'P500DT12H0MS' """ components = self.components - seconds = '{}.{:0>3}{:0>3}{:0>3}'.format(components.seconds, - components.milliseconds, - components.microseconds, - components.nanoseconds) + seconds = (f'{components.seconds}.' 
+ f'{components.milliseconds:0>3}' + f'{components.microseconds:0>3}' + f'{components.nanoseconds:0>3}') # Trim unnecessary 0s, 1.000000000 -> 1 seconds = seconds.rstrip('0').rstrip('.') - tpl = ('P{td.days}DT{td.hours}H{td.minutes}M{seconds}S' - .format(td=components, seconds=seconds)) + tpl = (f'P{components.days}DT{components.hours}' + f'H{components.minutes}M{seconds}S') return tpl @@ -1276,7 +1270,7 @@ class Timedelta(_Timedelta): value = convert_to_timedelta64(value, 'ns') elif is_timedelta64_object(value): if unit is not None: - value = value.astype('timedelta64[{0}]'.format(unit)) + value = value.astype(f'timedelta64[{unit}]') value = value.astype('timedelta64[ns]') elif hasattr(value, 'delta'): value = np.timedelta64(delta_to_nanoseconds(value.delta), 'ns') @@ -1288,9 +1282,8 @@ class Timedelta(_Timedelta): return NaT else: raise ValueError( - "Value must be Timedelta, string, integer, " - "float, timedelta or convertible, not {typ}" - .format(typ=type(value).__name__)) + f"Value must be Timedelta, string, integer, " + f"float, timedelta or convertible, not {type(value).__name__}") if is_timedelta64_object(value): value = value.view('i8') @@ -1485,9 +1478,7 @@ class Timedelta(_Timedelta): else: return self.to_timedelta64() // other - raise TypeError('Invalid dtype {dtype} for ' - '{op}'.format(dtype=other.dtype, - op='__floordiv__')) + raise TypeError(f'Invalid dtype {other.dtype} for __floordiv__') elif is_integer_object(other) or is_float_object(other): return Timedelta(self.value // other, unit='ns') @@ -1530,9 +1521,7 @@ class Timedelta(_Timedelta): """) warnings.warn(msg, FutureWarning) return other // self.value - raise TypeError('Invalid dtype {dtype} for ' - '{op}'.format(dtype=other.dtype, - op='__floordiv__')) + raise TypeError(f'Invalid dtype {other.dtype} for __floordiv__') elif is_float_object(other) and util.is_nan(other): # i.e. np.nan @@ -1555,8 +1544,7 @@ class Timedelta(_Timedelta): if hasattr(other, 'dtype') and other.dtype.kind == 'i': # TODO: Remove this check with backwards-compat shim # for integer / Timedelta is removed. - raise TypeError("Invalid type {dtype} for " - "{op}".format(dtype=other.dtype, op='__mod__')) + raise TypeError(f'Invalid dtype {other.dtype} for __mod__') return self.__rdivmod__(other)[1] def __divmod__(self, other): @@ -1569,8 +1557,7 @@ class Timedelta(_Timedelta): if hasattr(other, 'dtype') and other.dtype.kind == 'i': # TODO: Remove this check with backwards-compat shim # for integer / Timedelta is removed. 
- raise TypeError("Invalid type {dtype} for " - "{op}".format(dtype=other.dtype, op='__mod__')) + raise TypeError(f'Invalid dtype {other.dtype} for __mod__') div = other // self return div, other - div * self diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 03ed26337d539..1a278f46a4a2b 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -370,8 +370,8 @@ class Timestamp(_Timestamp): if tzinfo is not None: if not PyTZInfo_Check(tzinfo): # tzinfo must be a datetime.tzinfo object, GH#17690 - raise TypeError('tzinfo must be a datetime.tzinfo object, ' - 'not %s' % type(tzinfo)) + raise TypeError(f'tzinfo must be a datetime.tzinfo object, ' + f'not {type(tzinfo)}') elif tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') @@ -946,8 +946,8 @@ default 'raise' def validate(k, v): """ validate integers """ if not is_integer_object(v): - raise ValueError("value must be an integer, received " - "{v} for {k}".format(v=type(v), k=k)) + raise ValueError(f"value must be an integer, received " + f"{type(v)} for {k}") return v if year is not None: @@ -1003,9 +1003,9 @@ default 'raise' base1, base2 = base, "" if self.microsecond != 0: - base1 += "%.3d" % self.nanosecond + base1 += f"{self.nanosecond:03d}" else: - base1 += ".%.9d" % self.nanosecond + base1 += f".{self.nanosecond:09d}" return base1 + base2 diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index bc1fdfae99de9..35ee87e714fa8 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -280,8 +280,8 @@ def infer_tzinfo(start, end): if start is not None and end is not None: tz = start.tzinfo if not tz_compare(tz, end.tzinfo): - msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' - raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) + raise AssertionError(f'Inputs must both have the same timezone, ' + f'{tz} != {end.tzinfo}') elif start is not None: tz = start.tzinfo elif end is not None: diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index dd0c6fc75b06f..b368f0fde3edc 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -175,8 +175,8 @@ timedelta-like} if trans_idx.size == 1: stamp = _render_tstamp(vals[trans_idx]) raise pytz.AmbiguousTimeError( - "Cannot infer dst time from %s as there " - "are no repeated times".format(stamp)) + f"Cannot infer dst time from {stamp} as there " + f"are no repeated times") # Split the array into contiguous chunks (where the difference between # indices is 1). 
These are effectively dst transitions in different # years which is useful for checking that there is not an ambiguous @@ -200,8 +200,8 @@ timedelta-like} switch_idx = (delta <= 0).nonzero()[0] if switch_idx.size > 1: raise pytz.AmbiguousTimeError( - "There are %i dst switches when " - "there should only be 1.".format(switch_idx.size)) + f"There are {switch_idx.size} dst switches when " + f"there should only be 1.") switch_idx = switch_idx[0] + 1 # Pull the only index and adjust a_idx = grp[:switch_idx] @@ -230,8 +230,8 @@ timedelta-like} else: stamp = _render_tstamp(val) raise pytz.AmbiguousTimeError( - "Cannot infer dst time from %r, try using the " - "'ambiguous' argument".format(stamp)) + f"Cannot infer dst time from {stamp}, try using the " + f"'ambiguous' argument") elif left != NPY_NAT: result[i] = left elif right != NPY_NAT: @@ -246,8 +246,8 @@ timedelta-like} # time if -1 < shift_delta + remaining_mins < HOURS_NS: raise ValueError( - "The provided timedelta will relocalize on a " - "nonexistent time: {}".format(nonexistent) + f"The provided timedelta will relocalize on a " + f"nonexistent time: {nonexistent}" ) new_local = val + shift_delta elif shift_forward: From 62c3ff5a542f101b4a98a03a9664b7e2c0128bc5 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 22 Nov 2019 07:33:17 -0800 Subject: [PATCH 167/185] Reenabled no-unused-function (#29767) --- pandas/_libs/parsers.pyx | 53 ----------------------- pandas/_libs/src/ujson/python/objToJSON.c | 16 ------- pandas/_libs/tslibs/nattype.pyx | 4 -- setup.py | 2 +- 4 files changed, 1 insertion(+), 74 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index bbea66542a953..8f0f4e17df2f9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1406,59 +1406,6 @@ cdef inline StringPath _string_path(char *encoding): # Type conversions / inference support code -cdef _string_box_factorize(parser_t *parser, int64_t col, - int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): - cdef: - int error, na_count = 0 - Py_ssize_t i, lines - coliter_t it - const char *word = NULL - ndarray[object] result - - int ret = 0 - kh_strbox_t *table - - object pyval - - object NA = na_values[np.object_] - khiter_t k - - table = kh_init_strbox() - lines = line_end - line_start - result = np.empty(lines, dtype=np.object_) - coliter_setup(&it, parser, col, line_start) - - for i in range(lines): - COLITER_NEXT(it, word) - - if na_filter: - if kh_get_str_starts_item(na_hashset, word): - # in the hash table - na_count += 1 - result[i] = NA - continue - - k = kh_get_strbox(table, word) - - # in the hash table - if k != table.n_buckets: - # this increments the refcount, but need to test - pyval = table.vals[k] - else: - # box it. new ref? 
- pyval = PyBytes_FromString(word) - - k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval - - result[i] = pyval - - kh_destroy_strbox(table) - - return result, na_count - - cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset): diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 48712dc68829d..21f439ec93e0f 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -399,22 +399,6 @@ static void *CLong(JSOBJ obj, JSONTypeContext *tc, void *outValue, return NULL; } -#ifdef _LP64 -static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *((JSINT64 *)outValue) = PyLong_AsLong(obj); - return NULL; -} -#else -static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *((JSINT32 *)outValue) = PyLong_AsLong(obj); - return NULL; -} -#endif - static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { *((JSINT64 *)outValue) = GET_TC(tc)->longValue; diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 6e5e62e77b4b1..6fab827f1364a 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -95,10 +95,6 @@ cdef class _NaT(datetime): # higher than np.ndarray and np.matrix __array_priority__ = 100 - def __hash__(_NaT self): - # py3k needs this defined here - return hash(self.value) - def __richcmp__(_NaT self, object other, int op): cdef: int ndim = getattr(other, 'ndim', -1) diff --git a/setup.py b/setup.py index 0915b6aba113a..cfcef4f9fa075 100755 --- a/setup.py +++ b/setup.py @@ -463,7 +463,7 @@ def run(self): extra_link_args.append("/DEBUG") else: # args to ignore warnings - extra_compile_args = ["-Wno-unused-function"] + extra_compile_args = [] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") From f93e4df02588e4ae1c2d338cfeedaee0a88fac4b Mon Sep 17 00:00:00 2001 From: francisco souza Date: Fri, 22 Nov 2019 10:35:21 -0500 Subject: [PATCH 168/185] io/parsers: ensure decimal is str on PythonParser (#29743) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/parsers.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b015f439935cb..e302e209b56a1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -489,6 +489,7 @@ I/O - Bug in :meth:`Styler.background_gradient` not able to work with dtype ``Int64`` (:issue:`28869`) - Bug in :meth:`DataFrame.to_clipboard` which did not work reliably in ipython (:issue:`22707`) - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) +- Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - Plotting diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index cf1511c1221b3..bbec148b8745d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -488,7 +488,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "cache_dates": True, "thousands": None, "comment": None, - "decimal": b".", + "decimal": ".", # 'engine': 'c', "parse_dates": False, "keep_date_col": False, @@ -568,7 +568,7 @@ def parser_f( # Quoting, Compression, and File Format compression="infer", thousands=None, - 
decimal=b".", + decimal: str = ".", lineterminator=None, quotechar='"', quoting=csv.QUOTE_MINIMAL, From d56c2dc5797475f5d0393098c7ef4b3bd2610a7f Mon Sep 17 00:00:00 2001 From: cruzzoe Date: Fri, 22 Nov 2019 16:19:26 +0000 Subject: [PATCH 169/185] Add documentation linking to sqlalchemy (#29373) --- pandas/core/generic.py | 6 +++++- pandas/io/sql.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b16a72f01c739..2e2ae4e1dfa0a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2596,7 +2596,11 @@ def to_sql( Name of SQL table. con : sqlalchemy.engine.Engine or sqlite3.Connection Using SQLAlchemy makes it possible to use any DB supported by that - library. Legacy support is provided for sqlite3.Connection objects. + library. Legacy support is provided for sqlite3.Connection objects. The user + is responsible for engine disposal and connection closure for the SQLAlchemy + connectable See `here \ + `_ + schema : str, optional Specify the schema (if database flavor supports this). If None, use default schema. diff --git a/pandas/io/sql.py b/pandas/io/sql.py index e90e19649f645..684e602f06d12 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -361,7 +361,9 @@ def read_sql( or DBAPI2 connection (fallback mode) Using SQLAlchemy makes it possible to use any DB supported by that - library. If a DBAPI2 object, only sqlite3 is supported. + library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible + for engine disposal and connection closure for the SQLAlchemy connectable. See + `here `_ index_col : string or list of strings, optional, default: None Column(s) to set as index(MultiIndex). coerce_float : boolean, default True From 768b27f319b6cc2ae628581d10fb3b404134469a Mon Sep 17 00:00:00 2001 From: ganevgv Date: Fri, 22 Nov 2019 17:02:08 +0000 Subject: [PATCH 170/185] add test for unused level raises KeyError (#29760) --- pandas/tests/test_multilevel.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index f0928820367e9..44829423be1bb 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -583,6 +583,17 @@ def test_stack_unstack_wrong_level_name(self, method): with pytest.raises(KeyError, match="does not match index name"): getattr(s, method)("mistake") + def test_unused_level_raises(self): + # GH 20410 + mi = MultiIndex( + levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]], + codes=[[1, 0], [1, 0]], + ) + df = DataFrame(-1, index=range(3), columns=mi) + + with pytest.raises(KeyError, match="notevenone"): + df["notevenone"] + def test_unstack_level_name(self): result = self.frame.unstack("second") expected = self.frame.unstack(level=1) From 816f3dff798a3225796157d65c04dd1ceb77c5fb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Nov 2019 09:28:55 -0800 Subject: [PATCH 171/185] DEPR: enforce deprecations in core.internals (#29723) --- ci/deps/azure-windows-36.yaml | 2 +- ci/deps/travis-36-cov.yaml | 2 +- ci/deps/travis-36-locale.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 67 ++++++++++++++++++++++++ environment.yml | 2 +- pandas/compat/_optional.py | 2 +- pandas/core/internals/blocks.py | 20 ++----- pandas/tests/internals/test_internals.py | 13 ----- pandas/tests/io/test_parquet.py | 2 +- requirements-dev.txt | 2 +- 11 files changed, 78 insertions(+), 38 deletions(-) diff --git a/ci/deps/azure-windows-36.yaml 
b/ci/deps/azure-windows-36.yaml index 3aa261a57f2d4..aa3962da9b4f0 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - blosc - bottleneck - - fastparquet>=0.2.1 + - fastparquet>=0.3.2 - matplotlib=3.0.2 - numexpr - numpy=1.15.* diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 170edd90ea3d7..ddc1ea41a08a3 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -17,7 +17,7 @@ dependencies: - botocore>=1.11 - cython>=0.29.13 - dask - - fastparquet>=0.2.1 + - fastparquet>=0.3.2 - gcsfs - geopandas - html5lib diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 5dc1e4524ec86..d0bc046575953 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -15,7 +15,7 @@ dependencies: - beautifulsoup4 - blosc=1.14.3 - python-blosc - - fastparquet=0.2.1 + - fastparquet=0.3.2 - gcsfs=0.2.2 - html5lib - ipython diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 3ba3a2fd4be1b..04df37427e4f5 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -250,7 +250,7 @@ SQLAlchemy 1.1.4 SQL support for databases other tha SciPy 0.19.0 Miscellaneous statistical functions XLsxWriter 0.9.8 Excel writing blosc Compression for msgpack -fastparquet 0.2.1 Parquet reading / writing +fastparquet 0.3.2 Parquet reading / writing gcsfs 0.2.2 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) lxml 3.8.0 HTML parser for read_html (see :ref:`note `) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e302e209b56a1..ac440c263088b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -235,6 +235,71 @@ The following methods now also correctly output values for unobserved categories df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() +.. _whatsnew_1000.api_breaking.deps: + +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some minimum supported versions of dependencies were updated (:issue:`29723`). +If installed, we now require: + ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| numpy | 1.13.3 | X | ++-----------------+-----------------+----------+ +| pytz | 2015.4 | X | ++-----------------+-----------------+----------+ +| python-dateutil | 2.6.1 | X | ++-----------------+-----------------+----------+ +| bottleneck | 1.2.1 | | ++-----------------+-----------------+----------+ +| numexpr | 2.6.2 | | ++-----------------+-----------------+----------+ +| pytest (dev) | 4.0.2 | | ++-----------------+-----------------+----------+ + +For `optional libraries `_ the general recommendation is to use the latest version. +The following table lists the lowest version per library that is currently being tested throughout the development of pandas. +Optional libraries below the lowest tested version may still work, but are not considered supported. 
+ ++-----------------+-----------------+ +| Package | Minimum Version | ++=================+=================+ +| beautifulsoup4 | 4.6.0 | ++-----------------+-----------------+ +| fastparquet | 0.3.2 | ++-----------------+-----------------+ +| gcsfs | 0.2.2 | ++-----------------+-----------------+ +| lxml | 3.8.0 | ++-----------------+-----------------+ +| matplotlib | 2.2.2 | ++-----------------+-----------------+ +| openpyxl | 2.4.8 | ++-----------------+-----------------+ +| pyarrow | 0.9.0 | ++-----------------+-----------------+ +| pymysql | 0.7.1 | ++-----------------+-----------------+ +| pytables | 3.4.2 | ++-----------------+-----------------+ +| scipy | 0.19.0 | ++-----------------+-----------------+ +| sqlalchemy | 1.1.4 | ++-----------------+-----------------+ +| xarray | 0.8.2 | ++-----------------+-----------------+ +| xlrd | 1.1.0 | ++-----------------+-----------------+ +| xlsxwriter | 0.9.8 | ++-----------------+-----------------+ +| xlwt | 1.2.0 | ++-----------------+-----------------+ + +See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. + + .. _whatsnew_1000.api.other: Other API changes @@ -321,6 +386,8 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed :meth:`DataFrame.as_blocks`, :meth:`Series.as_blocks`, `DataFrame.blocks`, :meth:`Series.blocks` (:issue:`17656`) - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) +- :func:`core.internals.blocks.make_block` no longer accepts the "fastpath" keyword(:issue:`19265`) +- :meth:`Block.make_block_same_class` no longer accepts the "dtype" keyword(:issue:`19434`) - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) - Removed support for legacy HDF5 formats (:issue:`29787`) - :func:`read_excel` removed support for "skip_footer" argument, use "skipfooter" instead (:issue:`18836`) diff --git a/environment.yml b/environment.yml index 54c99f415165d..848825c37a160 100644 --- a/environment.yml +++ b/environment.yml @@ -75,7 +75,7 @@ dependencies: # optional for io - beautifulsoup4>=4.6.0 # pandas.read_html - - fastparquet>=0.2.1 # pandas.read_parquet, DataFrame.to_parquet + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - html5lib # pandas.read_html - lxml # pandas.read_html - openpyxl # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index ce9079ce8864d..bfe31c6a1d794 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -8,7 +8,7 @@ VERSIONS = { "bs4": "4.6.0", "bottleneck": "1.2.1", - "fastparquet": "0.2.1", + "fastparquet": "0.3.2", "gcsfs": "0.2.2", "lxml.etree": "3.8.0", "matplotlib": "2.2.2", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5edb4d93e068a..2d6ffb7277742 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -251,21 +251,13 @@ def make_block(self, values, placement=None): return make_block(values, placement=placement, ndim=self.ndim) - def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): + def make_block_same_class(self, values, placement=None, ndim=None): """ Wrap given values in a block of same type as self. 
""" - if dtype is not None: - # issue 19431 fastparquet is passing this - warnings.warn( - "dtype argument is deprecated, will be removed in a future release.", - FutureWarning, - ) if placement is None: placement = self.mgr_locs if ndim is None: ndim = self.ndim - return make_block( - values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype - ) + return make_block(values, placement=placement, ndim=ndim, klass=self.__class__) def __repr__(self) -> str: # don't want to print out all of the items here @@ -3001,7 +2993,7 @@ def get_block_type(values, dtype=None): return cls -def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=None): +def make_block(values, placement, klass=None, ndim=None, dtype=None): # Ensure that we don't allow PandasArray / PandasDtype in internals. # For now, blocks should be backed by ndarrays when possible. if isinstance(values, ABCPandasArray): @@ -3012,12 +3004,6 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=No if isinstance(dtype, PandasDtype): dtype = dtype.numpy_dtype - if fastpath is not None: - # GH#19265 pyarrow is passing this - warnings.warn( - "fastpath argument is deprecated, will be removed in a future release.", - FutureWarning, - ) if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c98bdab0df766..abe2ddf955ad8 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -308,12 +308,6 @@ def test_delete(self): with pytest.raises(Exception): newb.delete(3) - def test_make_block_same_class(self): - # issue 19431 - block = create_block("M8[ns, US/Eastern]", [3]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - block.make_block_same_class(block.values, dtype=block.values.dtype) - class TestDatetimeBlock: def test_can_hold_element(self): @@ -1255,13 +1249,6 @@ def test_holder(typestr, holder): assert blk._holder is holder -def test_deprecated_fastpath(): - # GH#19265 - values = np.random.rand(3, 3) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - make_block(values, placement=np.arange(3), fastpath=True) - - def test_validate_ndim(): values = np.array([1.0, 2.0]) placement = slice(2) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index bcbbee3b86769..3e687d185df84 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -531,7 +531,7 @@ def test_additional_extension_arrays(self, pa): class TestParquetFastParquet(Base): - @td.skip_if_no("fastparquet", min_version="0.2.1") + @td.skip_if_no("fastparquet", min_version="0.3.2") def test_basic(self, fp, df_full): df = df_full diff --git a/requirements-dev.txt b/requirements-dev.txt index 87b348c39a17b..4d0e7ee904294 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -48,7 +48,7 @@ matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.1 beautifulsoup4>=4.6.0 -fastparquet>=0.2.1 +fastparquet>=0.3.2 html5lib lxml openpyxl From 6332b1e86a9dd83764168cc408902f17bdbb97e9 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 22 Nov 2019 19:40:59 +0000 Subject: [PATCH 172/185] minor cleanups (#29798) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/arrays/categorical.py | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ac440c263088b..2b68ddf3d8918 
100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -426,7 +426,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) - Performance improvement in :meth:`DataFrame.select_dtypes` by using vectorization instead of iterating over a loop (:issue:`28317`) - Performance improvement in :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` (:issue:`28795`) -- Performance improvement when searching for a scalar in a :meth:`Categorical` and the scalar is not found in the categories (:issue:`29750`) +- Performance improvement when comparing a :meth:`Categorical` with a scalar and the scalar is not found in the categories (:issue:`29750`) .. _whatsnew_1000.bug_fixes: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 85688a394ebda..ca9ec2fd63165 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -77,12 +77,6 @@ def _cat_compare_op(op): @unpack_zerodim_and_defer(opname) def func(self, other): - # On python2, you can usually compare any type to any type, and - # Categoricals can be seen as a custom type, but having different - # results depending whether categories are the same or not is kind of - # insane, so be a bit stricter here and use the python3 idea of - # comparing only things of equal type. - if is_list_like(other) and len(other) != len(self): # TODO: Could this fail if the categories are listlike objects? raise ValueError("Lengths must match.") @@ -840,8 +834,8 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal On the other hand this methods does not do checks (e.g., whether the old categories are included in the new categories on a reorder), which can result in surprising changes, for example when using special string - dtypes on python3, which does not considers a S1 string equal to a - single char python string. + dtypes, which does not considers a S1 string equal to a single char + python string. 
Parameters ---------- From d55258f3ccb05a44869e005da302e7cba2d50cc8 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sat, 23 Nov 2019 19:15:12 +0200 Subject: [PATCH 173/185] Fixed small mistake (#29815) --- pandas/_libs/tslibs/strptime.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 71e5006761097..fda508e51e48f 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -110,7 +110,7 @@ def array_strptime(object[:] values, object fmt, f"in format '{fmt}'") # IndexError only occurs when the format string is "%" except IndexError: - raise ValueError("stray % in format '{fmt}'") + raise ValueError(f"stray % in format '{fmt}'") _regex_cache[fmt] = format_regex result = np.empty(n, dtype='M8[ns]') From 3f69d6277c4961631b606a62cad117daa6b3c052 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 23 Nov 2019 13:57:23 -0800 Subject: [PATCH 174/185] REF: Create _lib/window directory (#29817) --- pandas/_libs/window/__init__.py | 0 .../{window.pyx => window/aggregations.pyx} | 0 .../indexers.pyx} | 35 +++---------------- pandas/core/window/ewm.py | 12 +++---- pandas/core/window/rolling.py | 12 +++---- setup.py | 12 ++++--- 6 files changed, 25 insertions(+), 46 deletions(-) create mode 100644 pandas/_libs/window/__init__.py rename pandas/_libs/{window.pyx => window/aggregations.pyx} (100%) rename pandas/_libs/{window_indexer.pyx => window/indexers.pyx} (83%) diff --git a/pandas/_libs/window/__init__.py b/pandas/_libs/window/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window/aggregations.pyx similarity index 100% rename from pandas/_libs/window.pyx rename to pandas/_libs/window/aggregations.pyx diff --git a/pandas/_libs/window_indexer.pyx b/pandas/_libs/window/indexers.pyx similarity index 83% rename from pandas/_libs/window_indexer.pyx rename to pandas/_libs/window/indexers.pyx index 8f49a8b9462d3..eab9f0f8aab43 100644 --- a/pandas/_libs/window_indexer.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -1,5 +1,7 @@ # cython: boundscheck=False, wraparound=False, cdivision=True +from typing import Tuple + import numpy as np from numpy cimport ndarray, int64_t @@ -8,33 +10,6 @@ from numpy cimport ndarray, int64_t # These define start/end indexers to compute offsets -class MockFixedWindowIndexer: - """ - - We are just checking parameters of the indexer, - and returning a consistent API with fixed/variable - indexers. 
- - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - index: object - index of the values - closed: string - closed behavior - """ - def __init__(self, ndarray values, int64_t win, object closed, object index=None): - - self.start = np.empty(0, dtype='int64') - self.end = np.empty(0, dtype='int64') - - def get_window_bounds(self): - return self.start, self.end - - class FixedWindowIndexer: """ create a fixed length window indexer object @@ -66,7 +41,7 @@ class FixedWindowIndexer: end_e = start_e + win self.end = np.concatenate([end_s, end_e])[:N] - def get_window_bounds(self): + def get_window_bounds(self) -> Tuple[np.ndarray, np.ndarray]: return self.start, self.end @@ -108,7 +83,7 @@ class VariableWindowIndexer: @staticmethod def build(const int64_t[:] index, int64_t win, bint left_closed, - bint right_closed, int64_t N): + bint right_closed, int64_t N) -> Tuple[np.ndarray, np.ndarray]: cdef: ndarray[int64_t] start, end @@ -161,5 +136,5 @@ class VariableWindowIndexer: end[i] -= 1 return start, end - def get_window_bounds(self): + def get_window_bounds(self) -> Tuple[np.ndarray, np.ndarray]: return self.start, self.end diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 89c25c07b0dbf..c9837afd96356 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -2,7 +2,7 @@ import numpy as np -import pandas._libs.window as libwindow +import pandas._libs.window.aggregations as window_aggregations from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution @@ -228,11 +228,11 @@ def _apply(self, func, **kwargs): # if we have a string function name, wrap it if isinstance(func, str): - cfunc = getattr(libwindow, func, None) + cfunc = getattr(window_aggregations, func, None) if cfunc is None: raise ValueError( "we do not support this function " - "in libwindow.{func}".format(func=func) + "in window_aggregations.{func}".format(func=func) ) def func(arg): @@ -284,7 +284,7 @@ def var(self, bias=False, *args, **kwargs): nv.validate_window_func("var", args, kwargs) def f(arg): - return libwindow.ewmcov( + return window_aggregations.ewmcov( arg, arg, self.com, @@ -328,7 +328,7 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = libwindow.ewmcov( + cov = window_aggregations.ewmcov( X._prep_values(), Y._prep_values(), self.com, @@ -375,7 +375,7 @@ def _get_corr(X, Y): Y = self._shallow_copy(Y) def _cov(x, y): - return libwindow.ewmcov( + return window_aggregations.ewmcov( x, y, self.com, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 6a35664ece765..2f37ba9b8f725 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -10,8 +10,8 @@ import numpy as np -import pandas._libs.window as libwindow -import pandas._libs.window_indexer as libwindow_indexer +import pandas._libs.window.aggregations as window_aggregations +import pandas._libs.window.indexers as window_indexers from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -381,11 +381,11 @@ def _get_roll_func(self, func_name: str) -> Callable: ------- func : callable """ - window_func = getattr(libwindow, func_name, None) + window_func = getattr(window_aggregations, func_name, None) if window_func is None: raise ValueError( "we do not support this function 
" - "in libwindow.{func_name}".format(func_name=func_name) + "in window_aggregations.{func_name}".format(func_name=func_name) ) return window_func @@ -406,8 +406,8 @@ def _get_window_indexer(self): Return an indexer class that will compute the window start and end bounds """ if self.is_freq_type: - return libwindow_indexer.VariableWindowIndexer - return libwindow_indexer.FixedWindowIndexer + return window_indexers.VariableWindowIndexer + return window_indexers.FixedWindowIndexer def _apply( self, diff --git a/setup.py b/setup.py index cfcef4f9fa075..e6a95d4e7afd8 100755 --- a/setup.py +++ b/setup.py @@ -344,13 +344,13 @@ class CheckSDist(sdist_class): "pandas/_libs/tslibs/resolution.pyx", "pandas/_libs/tslibs/parsing.pyx", "pandas/_libs/tslibs/tzconversion.pyx", - "pandas/_libs/window_indexer.pyx", + "pandas/_libs/window/indexers.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", ] _cpp_pyxfiles = [ - "pandas/_libs/window.pyx", + "pandas/_libs/window/aggregations.pyx", "pandas/io/msgpack/_packer.pyx", "pandas/io/msgpack/_unpacker.pyx", ] @@ -683,8 +683,12 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "sources": np_datetime_sources, }, "_libs.testing": {"pyxfile": "_libs/testing"}, - "_libs.window": {"pyxfile": "_libs/window", "language": "c++", "suffix": ".cpp"}, - "_libs.window_indexer": {"pyxfile": "_libs/window_indexer"}, + "_libs.window.aggregations": { + "pyxfile": "_libs/window/aggregations", + "language": "c++", + "suffix": ".cpp" + }, + "_libs.window.indexers": {"pyxfile": "_libs/window/indexers"}, "_libs.writers": {"pyxfile": "_libs/writers"}, "io.sas._sas": {"pyxfile": "io/sas/sas"}, "io.msgpack._packer": { From e0bd4d5dd07cc481cb52de3cf3c7bf199cb2df07 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Sat, 23 Nov 2019 23:43:52 +0100 Subject: [PATCH 175/185] BUG: pivot_table not returning correct type when margin=True and aggfunc='mean' (#28248) --- doc/source/whatsnew/v1.0.0.rst | 2 ++ pandas/core/reshape/pivot.py | 5 ++++- pandas/tests/reshape/merge/test_pivot_old.py | 0 pandas/tests/reshape/test_pivot.py | 18 ++++++++++++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/reshape/merge/test_pivot_old.py diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2b68ddf3d8918..28a61c535f951 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -595,6 +595,7 @@ Reshaping - Bug in :meth:`DataFrame.apply` that caused incorrect output with empty :class:`DataFrame` (:issue:`28202`, :issue:`21959`) - Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue: `28301`) +- Bug in :meth:`pivot_table` not returning correct type ``float`` when ``margins=True`` and ``aggfunc='mean'`` (:issue:`24893`) - Bug :func:`merge_asof` could not use :class:`datetime.timedelta` for ``tolerance`` kwarg (:issue:`28098`) - Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`) - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) @@ -604,6 +605,7 @@ Reshaping - Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) - Bug in :meth:`DataFrame.replace` that caused non-numeric replacer's dtype not respected (:issue:`26632`) + Sparse ^^^^^^ - Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`) diff --git a/pandas/core/reshape/pivot.py 
b/pandas/core/reshape/pivot.py index c7d3adece521e..27d6a28a33cc6 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -261,9 +261,12 @@ def _add_margins( row_names = result.index.names try: + # check the result column and leave floats for dtype in set(result.dtypes): cols = result.select_dtypes([dtype]).columns - margin_dummy[cols] = margin_dummy[cols].astype(dtype) + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) result = result.append(margin_dummy) except TypeError: diff --git a/pandas/tests/reshape/merge/test_pivot_old.py b/pandas/tests/reshape/merge/test_pivot_old.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 5b6dc70894857..bd1d3d2d5bb63 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1656,6 +1656,24 @@ def test_categorical_margins_category(self, observed): table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) + def test_margins_casted_to_float(self, observed): + # GH 24893 + df = pd.DataFrame( + { + "A": [2, 4, 6, 8], + "B": [1, 4, 5, 8], + "C": [1, 3, 4, 6], + "D": ["X", "X", "Y", "Y"], + } + ) + + result = pd.pivot_table(df, index="D", margins=True) + expected = pd.DataFrame( + {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, + index=pd.Index(["X", "Y", "All"], name="D"), + ) + tm.assert_frame_equal(result, expected) + def test_pivot_with_categorical(self, observed, ordered_fixture): # gh-21370 idx = [np.nan, "low", "high", "low", np.nan] From ab0c5822eb3a4717003d0597307284dcd7c4a887 Mon Sep 17 00:00:00 2001 From: alexander135 <31936366+alexander135@users.noreply.github.com> Date: Sun, 24 Nov 2019 02:02:52 +0300 Subject: [PATCH 176/185] Changed description of parse_dates in read_excel(). (#29796) --- pandas/io/excel/_base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e615507b4199d..c442f0d9bf66c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -165,8 +165,9 @@ result 'foo' If a column or index contains an unparseable date, the entire column or - index will be returned unaltered as an object data type. For non-standard - datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. + index will be returned unaltered as an object data type. If you don't want to + parse some cells as dates, just change their type in Excel to "Text". + For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates.
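For illustration, a minimal sketch of that recommended pattern (the file name, column name and date format are hypothetical):

    import pandas as pd

    # read the ambiguous column as plain strings, then parse it explicitly
    df = pd.read_excel("report.xlsx", dtype={"when": str})
    df["when"] = pd.to_datetime(df["when"], format="%d.%m.%Y")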
date_parser : function, optional From 6f1accd04ff6957b24648ee947dc103979dca350 Mon Sep 17 00:00:00 2001 From: ganevgv Date: Sat, 23 Nov 2019 23:04:40 +0000 Subject: [PATCH 177/185] TST: add test for ffill/bfill for non unique multilevel (#29763) --- pandas/tests/groupby/test_transform.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 3d9a349d94e10..c46180c1d11cd 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -911,6 +911,41 @@ def test_pct_change(test_series, freq, periods, fill_method, limit): tm.assert_frame_equal(result, expected.to_frame("vals")) +@pytest.mark.parametrize( + "func, expected_status", + [ + ("ffill", ["shrt", "shrt", "lng", np.nan, "shrt", "ntrl", "ntrl"]), + ("bfill", ["shrt", "lng", "lng", "shrt", "shrt", "ntrl", np.nan]), + ], +) +def test_ffill_bfill_non_unique_multilevel(func, expected_status): + # GH 19437 + date = pd.to_datetime( + [ + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-02", + "2018-01-01", + "2018-01-02", + ] + ) + symbol = ["MSFT", "MSFT", "MSFT", "AAPL", "AAPL", "TSLA", "TSLA"] + status = ["shrt", np.nan, "lng", np.nan, "shrt", "ntrl", np.nan] + + df = DataFrame({"date": date, "symbol": symbol, "status": status}) + df = df.set_index(["date", "symbol"]) + result = getattr(df.groupby("symbol")["status"], func)() + + index = MultiIndex.from_tuples( + tuples=list(zip(*[date, symbol])), names=["date", "symbol"] + ) + expected = Series(expected_status, index=index, name="status") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", [np.any, np.all]) def test_any_all_np_func(func): # GH 20653 From 76e39ebcf584042fab4f224a6bd2c903bb0c8aff Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sat, 23 Nov 2019 16:10:00 -0700 Subject: [PATCH 178/185] BUG: Fix melt with mixed int/str columns (#29792) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/reshape/melt.py | 5 +++-- pandas/tests/reshape/test_melt.py | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 28a61c535f951..77eb0b9fd9914 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -604,6 +604,7 @@ Reshaping - Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`) - Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) - Bug in :meth:`DataFrame.replace` that caused non-numeric replacer's dtype not respected (:issue:`26632`) +- Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) Sparse diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 4cba52c5cd651..8e9edfa5f1409 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.missing import notna from pandas.core.arrays import Categorical +import pandas.core.common as com from pandas.core.frame import DataFrame, _shared_docs from pandas.core.indexes.base import Index from pandas.core.reshape.concat import concat @@ -47,7 +48,7 @@ def melt( else: # Check that `id_vars` are in frame id_vars = list(id_vars) - missing = Index(np.ravel(id_vars)).difference(cols) + 
missing = Index(com.flatten(id_vars)).difference(cols) if not missing.empty: raise KeyError( "The following 'id_vars' are not present" @@ -69,7 +70,7 @@ def melt( else: value_vars = list(value_vars) # Check that `value_vars` are in frame - missing = Index(np.ravel(value_vars)).difference(cols) + missing = Index(com.flatten(value_vars)).difference(cols) if not missing.empty: raise KeyError( "The following 'value_vars' are not present in" diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 4521f1bbf1a08..d6946ea41ed84 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -317,6 +317,22 @@ def test_melt_missing_columns_raises(self): ): multi.melt(["A"], ["F"], col_level=0) + def test_melt_mixed_int_str_id_vars(self): + # GH 29718 + df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) + result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"]) + expected = DataFrame( + {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} + ) + tm.assert_frame_equal(result, expected) + + def test_melt_mixed_int_str_value_vars(self): + # GH 29718 + df = DataFrame({0: ["foo"], "a": ["bar"]}) + result = melt(df, value_vars=[0, "a"]) + expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]}) + tm.assert_frame_equal(result, expected) + class TestLreshape: def test_pairs(self): From cf202bec6bba8bfcf6e2f61bfe6f2817a2a67264 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 23 Nov 2019 15:10:38 -0800 Subject: [PATCH 179/185] STY: fstrings in io.pytables (#29758) --- pandas/io/pytables.py | 311 +++++++++++++++++------------------------- 1 file changed, 126 insertions(+), 185 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8afbd293a095b..b229e5b4e0f4e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -368,9 +368,7 @@ def read_hdf(path_or_buf, key=None, mode: str = "r", **kwargs): exists = False if not exists: - raise FileNotFoundError( - "File {path} does not exist".format(path=path_or_buf) - ) + raise FileNotFoundError(f"File {path_or_buf} does not exist") store = HDFStore(path_or_buf, mode=mode, **kwargs) # can't auto open/close if we are using an iterator @@ -485,9 +483,7 @@ def __init__( if complib is not None and complib not in tables.filters.all_complibs: raise ValueError( - "complib only supports {libs} compression.".format( - libs=tables.filters.all_complibs - ) + f"complib only supports {tables.filters.all_complibs} compression." ) if complib is None and complevel is not None: @@ -533,9 +529,7 @@ def __getattr__(self, name: str): except (KeyError, ClosedFileError): pass raise AttributeError( - "'{object}' object has no attribute '{name}'".format( - object=type(self).__name__, name=name - ) + f"'{type(self).__name__}' object has no attribute '{name}'" ) def __contains__(self, key: str): @@ -553,9 +547,8 @@ def __len__(self) -> int: return len(self.groups()) def __repr__(self) -> str: - return "{type}\nFile path: {path}\n".format( - type=type(self), path=pprint_thing(self._path) - ) + pstr = pprint_thing(self._path) + return f"{type(self)}\nFile path: {pstr}\n" def __enter__(self): return self @@ -607,8 +600,8 @@ def open(self, mode: str = "a", **kwargs): # this would truncate, raise here if self.is_open: raise PossibleDataLossError( - "Re-opening the file [{0}] with mode [{1}] " - "will delete the current file!".format(self._path, self._mode) + f"Re-opening the file [{self._path}] with mode [{self._mode}] " + "will delete the current file!" 
) self._mode = mode @@ -626,7 +619,7 @@ def open(self, mode: str = "a", **kwargs): self._handle = tables.open_file(self._path, self._mode, **kwargs) except IOError as err: # pragma: no cover if "can not be written" in str(err): - print("Opening {path} in read-only mode".format(path=self._path)) + print(f"Opening {self._path} in read-only mode") self._handle = tables.open_file(self._path, "r", **kwargs) else: raise @@ -636,18 +629,16 @@ def open(self, mode: str = "a", **kwargs): # trap PyTables >= 3.1 FILE_OPEN_POLICY exception # to provide an updated message if "FILE_OPEN_POLICY" in str(err): + hdf_version = tables.get_hdf5_version() err = ValueError( - "PyTables [{version}] no longer supports opening multiple " - "files\n" + f"PyTables [{tables.__version__}] no longer supports " + "opening multiple files\n" "even in read-only mode on this HDF5 version " - "[{hdf_version}]. You can accept this\n" + f"[{hdf_version}]. You can accept this\n" "and not open the same file multiple times at once,\n" "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 " "which allows\n" - "files to be opened multiple times at once\n".format( - version=tables.__version__, - hdf_version=tables.get_hdf5_version(), - ) + "files to be opened multiple times at once\n" ) raise err @@ -716,7 +707,7 @@ def get(self, key: str): """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") return self._read_group(group) def select( @@ -760,7 +751,7 @@ def select( """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") # create the storer and axes where = _ensure_term(where, scope_level=1) @@ -900,11 +891,11 @@ def select_as_multiple( nrows = None for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): if t is None: - raise KeyError("Invalid table [{key}]".format(key=k)) + raise KeyError(f"Invalid table [{k}]") if not t.is_table: raise TypeError( - "object [{obj}] is not a table, and cannot be used in all " - "select as multiple".format(obj=t.pathname) + f"object [{t.pathname}] is not a table, and cannot be used in all " + "select as multiple" ) if nrows is None: @@ -1289,7 +1280,7 @@ def get_storer(self, key: str): """ return the storer object for a key, raise if not in the file """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") s = self._create_storer(group) s.infer_axes() @@ -1365,9 +1356,9 @@ def info(self) -> str: ------- str """ - output = "{type}\nFile path: {path}\n".format( - type=type(self), path=pprint_thing(self._path) - ) + path = pprint_thing(self._path) + output = f"{type(self)}\nFile path: {path}\n" + if self.is_open: lkeys = sorted(self.keys()) if len(lkeys): @@ -1382,11 +1373,8 @@ def info(self) -> str: values.append(pprint_thing(s or "invalid_HDFStore node")) except Exception as detail: keys.append(k) - values.append( - "[invalid_HDFStore node: {detail}]".format( - detail=pprint_thing(detail) - ) - ) + dstr = pprint_thing(detail) + values.append(f"[invalid_HDFStore node: {dstr}]") output += adjoin(12, keys, values) else: @@ -1399,7 +1387,7 @@ def info(self) -> str: # private methods ###### def _check_if_open(self): if not self.is_open: - raise ClosedFileError("{0} file is not open!".format(self._path)) + raise ClosedFileError(f"{self._path} file 
is not open!") def _validate_format(self, format: str, kwargs: Dict[str, Any]) -> Dict[str, Any]: """ validate / deprecate formats; return the new kwargs """ @@ -1409,7 +1397,7 @@ def _validate_format(self, format: str, kwargs: Dict[str, Any]) -> Dict[str, Any try: kwargs["format"] = _FORMAT_MAP[format.lower()] except KeyError: - raise TypeError("invalid HDFStore format specified [{0}]".format(format)) + raise TypeError(f"invalid HDFStore format specified [{format}]") return kwargs @@ -1418,16 +1406,9 @@ def _create_storer(self, group, format=None, value=None, append=False, **kwargs) def error(t): raise TypeError( - "cannot properly create the storer for: [{t}] [group->" - "{group},value->{value},format->{format},append->{append}," - "kwargs->{kwargs}]".format( - t=t, - group=group, - value=type(value), - format=format, - append=append, - kwargs=kwargs, - ) + f"cannot properly create the storer for: [{t}] [group->" + f"{group},value->{type(value)},format->{format},append->{append}," + f"kwargs->{kwargs}]" ) pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) @@ -1768,7 +1749,7 @@ def __repr__(self) -> str: ) return ",".join( ( - "{key}->{value}".format(key=key, value=value) + f"{key}->{value}" for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) ) ) @@ -1898,12 +1879,10 @@ def validate_col(self, itemsize=None): itemsize = self.itemsize if c.itemsize < itemsize: raise ValueError( - "Trying to store a string with len [{itemsize}] in " - "[{cname}] column but\nthis column has a limit of " - "[{c_itemsize}]!\nConsider using min_itemsize to " - "preset the sizes on these columns".format( - itemsize=itemsize, cname=self.cname, c_itemsize=c.itemsize - ) + f"Trying to store a string with len [{itemsize}] in " + f"[{self.cname}] column but\nthis column has a limit of " + f"[{c.itemsize}]!\nConsider using min_itemsize to " + "preset the sizes on these columns" ) return c.itemsize @@ -1915,8 +1894,7 @@ def validate_attr(self, append: bool): existing_kind = getattr(self.attrs, self.kind_attr, None) if existing_kind is not None and existing_kind != self.kind: raise TypeError( - "incompatible kind in col [{existing} - " - "{self_kind}]".format(existing=existing_kind, self_kind=self.kind) + f"incompatible kind in col [{existing_kind} - {self.kind}]" ) def update_info(self, info): @@ -1942,14 +1920,9 @@ def update_info(self, info): else: raise ValueError( - "invalid info for [{name}] for [{key}], " - "existing_value [{existing_value}] conflicts with " - "new value [{value}]".format( - name=self.name, - key=key, - existing_value=existing_value, - value=value, - ) + f"invalid info for [{self.name}] for [{key}], " + f"existing_value [{existing_value}] conflicts with " + f"new value [{value}]" ) else: if value is not None or existing_value is not None: @@ -2060,7 +2033,7 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) """ return a new datacol with the block i """ if cname is None: - cname = name or "values_block_{idx}".format(idx=i) + cname = name or f"values_block_{i}" if name is None: name = cname @@ -2070,7 +2043,8 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) if version[0] == 0 and version[1] <= 10 and version[2] == 0: m = re.search(r"values_block_(\d+)", name) if m: - name = "values_{group}".format(group=m.groups()[0]) + grp = m.groups()[0] + name = f"values_{grp}" except IndexError: pass @@ -2090,9 +2064,9 @@ def __init__( ): super().__init__(values=values, kind=kind, typ=typ, cname=cname, **kwargs) 
self.dtype = None - self.dtype_attr = "{name}_dtype".format(name=self.name) + self.dtype_attr = f"{self.name}_dtype" self.meta = meta - self.meta_attr = "{name}_meta".format(name=self.name) + self.meta_attr = f"{self.name}_meta" self.set_data(data) self.set_metadata(metadata) @@ -2104,7 +2078,7 @@ def __repr__(self) -> str: ) return ",".join( ( - "{key}->{value}".format(key=key, value=value) + f"{key}->{value}" for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) ) ) @@ -2158,11 +2132,7 @@ def set_kind(self): elif dtype.startswith("bool"): self.kind = "bool" else: - raise AssertionError( - "cannot interpret dtype of [{dtype}] in [{obj}]".format( - dtype=dtype, obj=self - ) - ) + raise AssertionError(f"cannot interpret dtype of [{dtype}] in [{self}]") # set my typ if we need if self.typ is None: @@ -2253,10 +2223,8 @@ def set_atom_string( inferred_type = lib.infer_dtype(col.ravel(), skipna=False) if inferred_type != "string": raise TypeError( - "Cannot serialize the column [{item}] because\n" - "its data contents are [{type}] object dtype".format( - item=item, type=inferred_type - ) + f"Cannot serialize the column [{item}] because\n" + f"its data contents are [{inferred_type}] object dtype" ) # itemsize is the maximum length of a string (along any dimension) @@ -2279,18 +2247,18 @@ def set_atom_string( self.itemsize = itemsize self.kind = "string" self.typ = self.get_atom_string(block, itemsize) - self.set_data( - data_converted.astype("|S{size}".format(size=itemsize), copy=False) - ) + self.set_data(data_converted.astype(f"|S{itemsize}", copy=False)) def get_atom_coltype(self, kind=None): """ return the PyTables column class for this column """ if kind is None: kind = self.kind if self.kind.startswith("uint"): - col_name = "UInt{name}Col".format(name=kind[4:]) + k4 = kind[4:] + col_name = f"UInt{k4}Col" else: - col_name = "{name}Col".format(name=kind.capitalize()) + kcap = kind.capitalize() + col_name = f"{kcap}Col" return getattr(_tables(), col_name) @@ -2568,10 +2536,9 @@ def __repr__(self) -> str: s = self.shape if s is not None: if isinstance(s, (list, tuple)): - s = "[{shape}]".format(shape=",".join(pprint_thing(x) for x in s)) - return "{type:12.12} (shape->{shape})".format( - type=self.pandas_type, shape=s - ) + jshape = ",".join(pprint_thing(x) for x in s) + s = f"[{jshape}]" + return f"{self.pandas_type:12.12} (shape->{s})" return self.pandas_type def set_object_info(self): @@ -2798,7 +2765,7 @@ def read_array( return ret def read_index(self, key, **kwargs): - variety = _ensure_decoded(getattr(self.attrs, "{key}_variety".format(key=key))) + variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) if variety == "multi": return self.read_multi_index(key, **kwargs) @@ -2810,22 +2777,20 @@ def read_index(self, key, **kwargs): _, index = self.read_index_node(getattr(self.group, key), **kwargs) return index else: # pragma: no cover - raise TypeError( - "unrecognized index variety: {variety}".format(variety=variety) - ) + raise TypeError(f"unrecognized index variety: {variety}") def write_index(self, key, index): if isinstance(index, MultiIndex): - setattr(self.attrs, "{key}_variety".format(key=key), "multi") + setattr(self.attrs, f"{key}_variety", "multi") self.write_multi_index(key, index) elif isinstance(index, BlockIndex): - setattr(self.attrs, "{key}_variety".format(key=key), "block") + setattr(self.attrs, f"{key}_variety", "block") self.write_block_index(key, index) elif isinstance(index, IntIndex): - setattr(self.attrs, "{key}_variety".format(key=key), 
"sparseint") + setattr(self.attrs, f"{key}_variety", "sparseint") self.write_sparse_intindex(key, index) else: - setattr(self.attrs, "{key}_variety".format(key=key), "regular") + setattr(self.attrs, f"{key}_variety", "regular") converted = _convert_index( "index", index, self.encoding, self.errors, self.format_type ) @@ -2846,27 +2811,27 @@ def write_index(self, key, index): node._v_attrs.tz = _get_tz(index.tz) def write_block_index(self, key, index): - self.write_array("{key}_blocs".format(key=key), index.blocs) - self.write_array("{key}_blengths".format(key=key), index.blengths) - setattr(self.attrs, "{key}_length".format(key=key), index.length) + self.write_array(f"{key}_blocs", index.blocs) + self.write_array(f"{key}_blengths", index.blengths) + setattr(self.attrs, f"{key}_length", index.length) def read_block_index(self, key, **kwargs) -> BlockIndex: - length = getattr(self.attrs, "{key}_length".format(key=key)) - blocs = self.read_array("{key}_blocs".format(key=key), **kwargs) - blengths = self.read_array("{key}_blengths".format(key=key), **kwargs) + length = getattr(self.attrs, f"{key}_length") + blocs = self.read_array(f"{key}_blocs", **kwargs) + blengths = self.read_array(f"{key}_blengths", **kwargs) return BlockIndex(length, blocs, blengths) def write_sparse_intindex(self, key, index): - self.write_array("{key}_indices".format(key=key), index.indices) - setattr(self.attrs, "{key}_length".format(key=key), index.length) + self.write_array(f"{key}_indices", index.indices) + setattr(self.attrs, f"{key}_length", index.length) def read_sparse_intindex(self, key, **kwargs) -> IntIndex: - length = getattr(self.attrs, "{key}_length".format(key=key)) - indices = self.read_array("{key}_indices".format(key=key), **kwargs) + length = getattr(self.attrs, f"{key}_length") + indices = self.read_array(f"{key}_indices", **kwargs) return IntIndex(length, indices) def write_multi_index(self, key, index): - setattr(self.attrs, "{key}_nlevels".format(key=key), index.nlevels) + setattr(self.attrs, f"{key}_nlevels", index.nlevels) for i, (lev, level_codes, name) in enumerate( zip(index.levels, index.codes, index.names) @@ -2876,7 +2841,7 @@ def write_multi_index(self, key, index): raise NotImplementedError( "Saving a MultiIndex with an extension dtype is not supported." 
) - level_key = "{key}_level{idx}".format(key=key, idx=i) + level_key = f"{key}_level{i}" conv_level = _convert_index( level_key, lev, self.encoding, self.errors, self.format_type ) @@ -2886,25 +2851,25 @@ def write_multi_index(self, key, index): node._v_attrs.name = name # write the name - setattr(node._v_attrs, "{key}_name{name}".format(key=key, name=name), name) + setattr(node._v_attrs, f"{key}_name{name}", name) # write the labels - label_key = "{key}_label{idx}".format(key=key, idx=i) + label_key = f"{key}_label{i}" self.write_array(label_key, level_codes) def read_multi_index(self, key, **kwargs) -> MultiIndex: - nlevels = getattr(self.attrs, "{key}_nlevels".format(key=key)) + nlevels = getattr(self.attrs, f"{key}_nlevels") levels = [] codes = [] names = [] for i in range(nlevels): - level_key = "{key}_level{idx}".format(key=key, idx=i) + level_key = f"{key}_level{i}" name, lev = self.read_index_node(getattr(self.group, level_key), **kwargs) levels.append(lev) names.append(name) - label_key = "{key}_label{idx}".format(key=key, idx=i) + label_key = f"{key}_label{i}" level_codes = self.read_array(label_key, **kwargs) codes.append(level_codes) @@ -3098,7 +3063,7 @@ def shape(self): # items items = 0 for i in range(self.nblocks): - node = getattr(self.group, "block{idx}_items".format(idx=i)) + node = getattr(self.group, f"block{i}_items") shape = getattr(node, "shape", None) if shape is not None: items += shape[0] @@ -3131,17 +3096,15 @@ def read(self, start=None, stop=None, **kwargs): for i in range(self.ndim): _start, _stop = (start, stop) if i == select_axis else (None, None) - ax = self.read_index("axis{idx}".format(idx=i), start=_start, stop=_stop) + ax = self.read_index(f"axis{i}", start=_start, stop=_stop) axes.append(ax) items = axes[0] blocks = [] for i in range(self.nblocks): - blk_items = self.read_index("block{idx}_items".format(idx=i)) - values = self.read_array( - "block{idx}_values".format(idx=i), start=_start, stop=_stop - ) + blk_items = self.read_index(f"block{i}_items") + values = self.read_array(f"block{i}_values", start=_start, stop=_stop) blk = make_block( values, placement=items.get_indexer(blk_items), ndim=len(axes) ) @@ -3160,17 +3123,15 @@ def write(self, obj, **kwargs): if i == 0: if not ax.is_unique: raise ValueError("Columns index has to be unique for fixed format") - self.write_index("axis{idx}".format(idx=i), ax) + self.write_index(f"axis{i}", ax) # Supporting mixed-type DataFrame objects...nontrivial self.attrs.nblocks = len(data.blocks) for i, blk in enumerate(data.blocks): # I have no idea why, but writing values before items fixed #2299 blk_items = data.items.take(blk.mgr_locs) - self.write_array( - "block{idx}_values".format(idx=i), blk.values, items=blk_items - ) - self.write_index("block{idx}_items".format(idx=i), blk_items) + self.write_array(f"block{i}_values", blk.values, items=blk_items) + self.write_index(f"block{i}_items", blk_items) class FrameFixed(BlockManagerFixed): @@ -3231,25 +3192,19 @@ def format_type(self) -> str: def __repr__(self) -> str: """ return a pretty representation of myself """ self.infer_axes() - dc = ",dc->[{columns}]".format( - columns=(",".join(self.data_columns) if len(self.data_columns) else "") - ) + jdc = ",".join(self.data_columns) if len(self.data_columns) else "" + dc = f",dc->[{jdc}]" ver = "" if self.is_old_version: - ver = "[{version}]".format(version=".".join(str(x) for x in self.version)) + jver = ".".join(str(x) for x in self.version) + ver = f"[{jver}]" + jindex_axes = ",".join(a.name for a in 
self.index_axes) return ( - "{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows}," - "ncols->{ncols},indexers->[{index_axes}]{dc})".format( - pandas_type=self.pandas_type, - ver=ver, - table_type=self.table_type_short, - nrows=self.nrows, - ncols=self.ncols, - index_axes=(",".join(a.name for a in self.index_axes)), - dc=dc, - ) + f"{self.pandas_type:12.12}{ver} " + f"(typ->{self.table_type_short},nrows->{self.nrows}," + f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})" ) def __getitem__(self, c): @@ -3267,9 +3222,7 @@ def validate(self, other): if other.table_type != self.table_type: raise TypeError( "incompatible table_type with existing " - "[{other} - {self}]".format( - other=other.table_type, self=self.table_type - ) + f"[{other.table_type} - {self.table_type}]" ) for c in ["index_axes", "non_index_axes", "values_axes"]: @@ -3282,16 +3235,14 @@ def validate(self, other): oax = ov[i] if sax != oax: raise ValueError( - "invalid combinate of [{c}] on appending data " - "[{sax}] vs current table [{oax}]".format( - c=c, sax=sax, oax=oax - ) + f"invalid combinate of [{c}] on appending data " + f"[{sax}] vs current table [{oax}]" ) # should never get here raise Exception( - "invalid combinate of [{c}] on appending data [{sv}] vs " - "current table [{ov}]".format(c=c, sv=sv, ov=ov) + f"invalid combinate of [{c}] on appending data [{sv}] vs " + f"current table [{ov}]" ) @property @@ -3308,8 +3259,7 @@ def validate_multiindex(self, obj): new object """ levels = [ - l if l is not None else "level_{0}".format(i) - for i, l in enumerate(obj.index.names) + l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names) ] try: return obj.reset_index(), levels @@ -3396,7 +3346,8 @@ def values_cols(self) -> List[str]: def _get_metadata_path(self, key) -> str: """ return the metadata pathname for this key """ - return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, key=key) + group = self.group._v_pathname + return f"{group}/meta/{key}/meta" def write_metadata(self, key: str, values): """ @@ -3476,8 +3427,8 @@ def validate_min_itemsize(self, min_itemsize): continue if k not in q: raise ValueError( - "min_itemsize has the key [{key}] which is not an axis or " - "data_column".format(key=k) + f"min_itemsize has the key [{k}] which is not an axis or " + "data_column" ) @property @@ -3646,8 +3597,8 @@ def validate_data_columns(self, data_columns, min_itemsize): info = self.info.get(axis, dict()) if info.get("type") == "MultiIndex" and data_columns: raise ValueError( - "cannot use a multi-index on axis [{0}] with " - "data_columns {1}".format(axis, data_columns) + f"cannot use a multi-index on axis [{axis}] with " + f"data_columns {data_columns}" ) # evaluate the passed data_columns, True == use all columns @@ -3706,9 +3657,10 @@ def create_axes( try: axes = _AXES_MAP[type(obj)] except KeyError: + group = self.group._v_name raise TypeError( - "cannot properly create the storer for: [group->{group}," - "value->{value}]".format(group=self.group._v_name, value=type(obj)) + f"cannot properly create the storer for: [group->{group}," + f"value->{type(obj)}]" ) # map axes to numbers @@ -3834,11 +3786,10 @@ def get_blk_items(mgr, blocks): new_blocks.append(b) new_blk_items.append(b_items) except (IndexError, KeyError): + jitems = ",".join(pprint_thing(item) for item in items) raise ValueError( - "cannot match existing table structure for [{items}] " - "on appending data".format( - items=(",".join(pprint_thing(item) for item in items)) - ) + f"cannot match existing table structure 
for [{jitems}] " + "on appending data" ) blocks = new_blocks blk_items = new_blk_items @@ -3867,10 +3818,8 @@ def get_blk_items(mgr, blocks): existing_col = existing_table.values_axes[i] except (IndexError, KeyError): raise ValueError( - "Incompatible appended table [{blocks}]" - "with existing table [{table}]".format( - blocks=blocks, table=existing_table.values_axes - ) + f"Incompatible appended table [{blocks}]" + f"with existing table [{existing_table.values_axes}]" ) else: existing_col = None @@ -3954,10 +3903,7 @@ def process_filter(field, filt): takers = op(values, filt) return obj.loc(axis=axis_number)[takers] - raise ValueError( - "cannot find the field [{field}] for " - "filtering!".format(field=field) - ) + raise ValueError(f"cannot find the field [{field}] for filtering!") obj = process_filter(field, filt) @@ -4052,8 +3998,8 @@ def read_column( if not a.is_data_indexable: raise ValueError( - "column [{column}] can not be extracted individually; " - "it is not data indexable".format(column=column) + f"column [{column}] can not be extracted individually; " + "it is not data indexable" ) # column must be an indexable or a data column @@ -4067,7 +4013,7 @@ def read_column( ) return Series(_set_tz(a.take_data(), a.tz, True), name=column) - raise KeyError("column [{column}] not found in the table".format(column=column)) + raise KeyError(f"column [{column}] not found in the table") class WORMTable(Table): @@ -4264,16 +4210,14 @@ def write_data_chunk(self, rows, indexes, mask, values): rows = rows[m] except Exception as detail: - raise Exception("cannot create row-data -> {detail}".format(detail=detail)) + raise Exception(f"cannot create row-data -> {detail}") try: if len(rows): self.table.append(rows) self.table.flush() except Exception as detail: - raise TypeError( - "tables cannot write this data -> {detail}".format(detail=detail) - ) + raise TypeError(f"tables cannot write this data -> {detail}") def delete( self, @@ -4733,9 +4677,7 @@ def _convert_index(name: str, index, encoding=None, errors="strict", format_type index_name=index_name, ) raise TypeError( - "[unicode] is not supported as a in index type for [{0}] formats".format( - format_type - ) + f"[unicode] is not supported as a in index type for [{format_type}] formats" ) elif inferred_type == "integer": @@ -4786,7 +4728,7 @@ def _unconvert_index(data, kind, encoding=None, errors="strict"): elif kind == "object": index = np.asarray(data[0]) else: # pragma: no cover - raise ValueError("unrecognized index type {kind}".format(kind=kind)) + raise ValueError(f"unrecognized index type {kind}") return index @@ -4818,7 +4760,7 @@ def _convert_string_array(data, encoding, errors, itemsize=None): ensured = ensure_object(data.ravel()) itemsize = max(1, libwriters.max_len_string_array(ensured)) - data = np.asarray(data, dtype="S{size}".format(size=itemsize)) + data = np.asarray(data, dtype=f"S{itemsize}") return data @@ -4847,7 +4789,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): if encoding is not None and len(data): itemsize = libwriters.max_len_string_array(ensure_object(data)) - dtype = "U{0}".format(itemsize) + dtype = f"U{itemsize}" if isinstance(data[0], bytes): data = Series(data).str.decode(encoding, errors=errors).values @@ -4960,16 +4902,15 @@ def generate(self, where): except NameError: # raise a nice message, suggesting that the user should use # data_columns + qkeys = ",".join(q.keys()) raise ValueError( - "The passed where expression: {0}\n" + f"The passed where expression: {where}\n" 
" contains an invalid variable reference\n" " all of the variable references must be a " "reference to\n" " an axis (e.g. 'index' or 'columns'), or a " "data_column\n" - " The currently defined references are: {1}\n".format( - where, ",".join(q.keys()) - ) + f" The currently defined references are: {qkeys}\n" ) def select(self): From a522b5c1d42e9efb16f44b3e991294b54a4fe035 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 23 Nov 2019 15:52:21 -0800 Subject: [PATCH 180/185] DEPR: passing an int to read_excel use_cols (#29795) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/excel/_util.py | 15 ++++----------- pandas/tests/io/excel/test_readers.py | 19 +++++-------------- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 77eb0b9fd9914..cd7c78112252d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -391,6 +391,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) - Removed support for legacy HDF5 formats (:issue:`29787`) - :func:`read_excel` removed support for "skip_footer" argument, use "skipfooter" instead (:issue:`18836`) +- :func:`read_excel` no longer allows an integer value for the parameter ``usecols``, instead pass a list of integers from 0 to ``usecols`` inclusive (:issue:`23635`) - :meth:`DataFrame.to_records` no longer supports the argument "convert_datetime64" (:issue:`18902`) - Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - Changed the default value for the "keep_tz" argument in :meth:`DatetimeIndex.to_series` to ``True`` (:issue:`23739`) diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 2ba3842d5c0c9..ee617d2013136 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,5 +1,3 @@ -import warnings - from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_integer, is_list_like @@ -136,16 +134,11 @@ def _maybe_convert_usecols(usecols): return usecols if is_integer(usecols): - warnings.warn( - ( - "Passing in an integer for `usecols` has been " - "deprecated. Please pass in a list of int from " - "0 to `usecols` inclusive instead." - ), - FutureWarning, - stacklevel=2, + raise ValueError( + "Passing an integer for `usecols` is no longer supported. " + "Please pass in a list of int from 0 to `usecols` " + "inclusive instead." 
) - return list(range(usecols + 1)) if isinstance(usecols, str): return _range2cols(usecols) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f6d94c4452076..e4b7d683b4c3b 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -88,27 +88,18 @@ def test_usecols_int(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) # usecols as int - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + msg = "Passing an integer for `usecols`" + with pytest.raises(ValueError, match=msg): with ignore_xlrd_time_clock_warning(): - df1 = pd.read_excel( - "test1" + read_ext, "Sheet1", index_col=0, usecols=3 - ) + pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols=3) # usecols as int - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + with pytest.raises(ValueError, match=msg): with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel( + pd.read_excel( "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3 ) - # TODO add index to xls file) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_list(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["B", "C"]) From 35029d20d53a1af57823759039f57d8ff09736c3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 24 Nov 2019 00:24:04 -0800 Subject: [PATCH 181/185] DEPR: remove Index.summary (#29807) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/indexes/base.py | 13 ------------- pandas/tests/indexes/test_base.py | 7 ------- 3 files changed, 1 insertion(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cd7c78112252d..f231c2b31abb1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -376,6 +376,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. **Other removals** +- Removed the previously deprecated :meth:`Index.summary` (:issue:`18217`) - Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`) - Changed the the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`) - Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`18529`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 10c0f465f69da..dd38bd0ee5f70 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1132,19 +1132,6 @@ def _summary(self, name=None): name = type(self).__name__ return f"{name}: {len(self)} entries{index_summary}" - def summary(self, name=None): - """ - Return a summarized representation. - - .. 
deprecated:: 0.23.0 - """ - warnings.warn( - "'summary' is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - return self._summary(name) - # -------------------------------------------------------------------- # Conversion Methods diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index facc025409f08..15844df5d7b04 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1385,13 +1385,6 @@ def test_summary_bug(self): assert "~:{range}:0" in result assert "{other}%s" in result - # GH18217 - def test_summary_deprecated(self): - ind = Index(["{other}%s", "~:{range}:0"], name="A") - - with tm.assert_produces_warning(FutureWarning): - ind.summary() - def test_format(self, indices): self._check_method_works(Index.format, indices) From 75ac56a19bbdb3c4cebfcaea182da31a7a7de8c5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 24 Nov 2019 23:55:16 -0800 Subject: [PATCH 182/185] DEPR: remove statsmodels/seaborn compat shims (#29822) --- pandas/core/api.py | 2 -- pandas/core/series.py | 17 ----------------- pandas/tests/series/test_missing.py | 6 ------ 3 files changed, 25 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index 04f2f84c92a15..7df2165201a99 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -10,7 +10,6 @@ ) from pandas.core.dtypes.missing import isna, isnull, notna, notnull -# TODO: Remove get_dummies import when statsmodels updates #18264 from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical from pandas.core.arrays.integer import ( @@ -45,7 +44,6 @@ from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range from pandas.core.indexing import IndexSlice -from pandas.core.reshape.reshape import get_dummies from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/series.py b/pandas/core/series.py index 6045d6a654508..1843ffb1afaec 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -96,23 +96,6 @@ ) -# see gh-16971 -def remove_na(arr): - """ - Remove null values from array like structure. - - .. deprecated:: 0.21.0 - Use s[s.notnull()] instead. - """ - - warnings.warn( - "remove_na is deprecated and is a private function. Do not use.", - FutureWarning, - stacklevel=2, - ) - return remove_na_arraylike(arr) - - def _coerce_method(converter): """ Install the scalar coercion methods. 
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 0751e1fb8b906..81bf1edbe86df 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -20,7 +20,6 @@ date_range, isna, ) -from pandas.core.series import remove_na import pandas.util.testing as tm @@ -48,11 +47,6 @@ def _simple_ts(start, end, freq="D"): class TestSeriesMissingData: - def test_remove_na_deprecation(self): - # see gh-16971 - with tm.assert_produces_warning(FutureWarning): - remove_na(Series([])) - def test_timedelta_fillna(self): # GH 3371 s = Series( From cc3daa6f7e140e870d57c3b02a5d2142e11d09c9 Mon Sep 17 00:00:00 2001 From: Max Chen Date: Mon, 25 Nov 2019 21:54:21 +0800 Subject: [PATCH 183/185] ENH: Add built-in function for Styler to format the text displayed for missing values (#29118) * Add built-in function for Styler to format the text displayed for missing values As described in GH #28358, a user who wants to control how NA values are printed while applying styles to the output will have to implement their own formatter. (so that the underlying data will not change and can be used for styling) --- doc/source/reference/style.rst | 1 + doc/source/user_guide/style.ipynb | 60 ++++++++++++++++++++++ doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/formats/style.py | 73 ++++++++++++++++++++++----- pandas/tests/io/formats/test_style.py | 69 +++++++++++++++++++++++++ 5 files changed, 191 insertions(+), 13 deletions(-) diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 3d155535e2585..24a47336b0522 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -41,6 +41,7 @@ Style application Styler.set_caption Styler.set_properties Styler.set_uuid + Styler.set_na_rep Styler.clear Styler.pipe diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index e0dc2e734e660..5e026e3a7d78f 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -67,6 +67,7 @@ "df = pd.DataFrame({'A': np.linspace(1, 10, 10)})\n", "df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],\n", " axis=1)\n", + "df.iloc[3, 3] = np.nan\n", "df.iloc[0, 2] = np.nan" ] }, @@ -402,6 +403,38 @@ "df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can format the text displayed for missing values with `na_rep`." ] }, + { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.style.format(\"{:.2%}\", na_rep=\"-\")" ] }, + { "cell_type": "markdown", "metadata": {}, "source": [ "These formatting techniques can be used in combination with styling." ] }, + { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.style.highlight_max().format(None, na_rep=\"-\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -659,6 +692,7 @@ "- precision\n", "- captions\n", "- table-wide styles\n", + "- missing values representation\n", "- hiding the index or columns\n", "\n", "Each of these can be specified in two ways:\n", @@ -800,6 +834,32 @@ "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here."
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing values" ] }, + { "cell_type": "markdown", "metadata": {}, "source": [ "You can control the default missing values representation for the entire table through the `set_na_rep` method." ] }, + { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(df.style\n", " .set_na_rep(\"FAIL\")\n", " .format(None, na_rep=\"PASS\", subset=[\"D\"])\n", " .highlight_null(\"yellow\"))" ] }, { "cell_type": "markdown", "metadata": {}, diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f231c2b31abb1..3990eec2435d9 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -122,6 +122,7 @@ Other enhancements - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) +- :meth:`Styler.format` added the ``na_rep`` parameter to help format missing values (:issue:`21527`, :issue:`28358`) - Roundtripping DataFrames with nullable integer or string data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6fc4e21d33d16..ebe86a7f535cb 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -8,7 +8,7 @@ import copy from functools import partial from itertools import product -from typing import Optional +from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple from uuid import uuid1 import numpy as np @@ -71,6 +71,11 @@ class Styler: The ``id`` takes the form ``T__row_col`` where ```` is the unique identifier, ```` is the row number and ```` is the column number. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + ..
versionadded:: 1.0.0 Attributes ---------- @@ -126,9 +131,10 @@ def __init__( caption=None, table_attributes=None, cell_ids=True, + na_rep: Optional[str] = None, ): - self.ctx = defaultdict(list) - self._todo = [] + self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) + self._todo: List[Tuple[Callable, Tuple, Dict]] = [] if not isinstance(data, (pd.Series, pd.DataFrame)): raise TypeError("``data`` must be a Series or DataFrame") @@ -149,19 +155,24 @@ def __init__( self.precision = precision self.table_attributes = table_attributes self.hidden_index = False - self.hidden_columns = [] + self.hidden_columns: Sequence[int] = [] self.cell_ids = cell_ids + self.na_rep = na_rep # display_funcs maps (row, col) -> formatting function def default_display_func(x): - if is_float(x): + if self.na_rep is not None and pd.isna(x): + return self.na_rep + elif is_float(x): display_format = "{0:.{precision}f}".format(x, precision=self.precision) return display_format else: return x - self._display_funcs = defaultdict(lambda: default_display_func) + self._display_funcs: DefaultDict[ + Tuple[int, int], Callable[[Any], str] + ] = defaultdict(lambda: default_display_func) def _repr_html_(self): """ @@ -416,16 +427,22 @@ def format_attr(pair): table_attributes=table_attr, ) - def format(self, formatter, subset=None): + def format(self, formatter, subset=None, na_rep: Optional[str] = None): """ Format the text display value of cells. Parameters ---------- - formatter : str, callable, or dict + formatter : str, callable, dict or None + If ``formatter`` is None, the default formatter is used subset : IndexSlice An argument to ``DataFrame.loc`` that restricts which elements ``formatter`` is applied to. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. versionadded:: 1.0.0 Returns ------- @@ -451,6 +468,10 @@ def format(self, formatter, subset=None): >>> df['c'] = ['a', 'b', 'c', 'd'] >>> df.style.format({'c': str.upper}) """ + if formatter is None: + assert self._display_funcs.default_factory is not None + formatter = self._display_funcs.default_factory() + if subset is None: row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) @@ -466,16 +487,16 @@ def format(self, formatter, subset=None): if is_dict_like(formatter): for col, col_formatter in formatter.items(): # formatter must be callable, so '{}' are converted to lambdas - col_formatter = _maybe_wrap_formatter(col_formatter) + col_formatter = _maybe_wrap_formatter(col_formatter, na_rep) col_num = self.data.columns.get_indexer_for([col])[0] for row_num in row_locs: self._display_funcs[(row_num, col_num)] = col_formatter else: # single scalar to format all cells with + formatter = _maybe_wrap_formatter(formatter, na_rep) locs = product(*(row_locs, col_locs)) for i, j in locs: - formatter = _maybe_wrap_formatter(formatter) self._display_funcs[(i, j)] = formatter return self @@ -553,6 +574,7 @@ def _copy(self, deepcopy=False): caption=self.caption, uuid=self.uuid, table_styles=self.table_styles, + na_rep=self.na_rep, ) if deepcopy: styler.ctx = copy.deepcopy(self.ctx) @@ -896,6 +918,23 @@ def set_table_styles(self, table_styles): self.table_styles = table_styles return self + def set_na_rep(self, na_rep: str) -> "Styler": + """ + Set the missing data representation on a Styler. + + .. 
versionadded:: 1.0.0 + + Parameters + ---------- + na_rep : str + + Returns + ------- + self : Styler + """ + self.na_rep = na_rep + return self + def hide_index(self): """ Hide any indices from rendering. @@ -1487,14 +1526,22 @@ def _get_level_lengths(index, hidden_elements=None): return non_zero_lengths -def _maybe_wrap_formatter(formatter): +def _maybe_wrap_formatter(formatter, na_rep: Optional[str]): if isinstance(formatter, str): - return lambda x: formatter.format(x) + formatter_func = lambda x: formatter.format(x) elif callable(formatter): - return formatter + formatter_func = formatter else: msg = ( "Expected a template string or callable, got {formatter} " "instead".format(formatter=formatter) ) raise TypeError(msg) + + if na_rep is None: + return formatter_func + elif isinstance(na_rep, str): + return lambda x: na_rep if pd.isna(x) else formatter_func(x) + else: + msg = "Expected a string, got {na_rep} instead".format(na_rep=na_rep) + raise TypeError(msg) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 61a3934187bd3..5a3afb5025e51 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1009,6 +1009,75 @@ def test_bar_bad_align_raises(self): with pytest.raises(ValueError): df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) + def test_format_with_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + + ctx = df.style.format("{:.2%}", na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "110.00%" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate() + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + def test_init_with_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = Styler(df, na_rep="NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + def test_set_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + ctx = ( + df.style.set_na_rep("NA") + .format(None, na_rep="-", subset=["B"]) + ._translate() + ) + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "-" + + def test_format_non_numeric_na(self): + # GH 21527 28358 + df = pd.DataFrame( + { + "object": [None, np.nan, "foo"], + "datetime": [None, pd.NaT, pd.Timestamp("20120101")], + } + ) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + assert ctx["body"][1][1]["display_value"] == "NA" + assert ctx["body"][1][2]["display_value"] == "NA" + + ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "-" + + 
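The tests above pin down the precedence rule introduced by this patch: an ``na_rep`` passed to ``format`` (optionally restricted with ``subset``) takes priority over the table-wide default set through ``set_na_rep`` or the constructor. A minimal sketch of that behaviour, assuming a pandas build with this patch applied (``df`` here is a throwaway example frame, not one used in the tests):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0, np.nan], "B": [np.nan, 2.0]})

styler = (
    df.style.set_na_rep("NA")                    # table-wide default for missing cells
    .format("{:.1f}", na_rep="-", subset=["B"])  # per-format override wins in column B
)
html = styler.render()  # NaN in column A renders as "NA", in column B as "-"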
def test_format_with_bad_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + with pytest.raises(TypeError): + df.style.format(None, na_rep=-1) + def test_highlight_null(self, null_color="red"): df = pd.DataFrame({"A": [0, np.nan]}) result = df.style.highlight_null()._compute().ctx From 11cb42346a1593af9ad04381a1b10b71d9256015 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Nov 2019 08:03:55 -0600 Subject: [PATCH 184/185] DOC: Add link to dev calendar and meeting notes (#29737) --- doc/source/development/index.rst | 1 + doc/source/development/meeting.rst | 32 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 doc/source/development/meeting.rst diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index a523ae0c957f1..757b197c717e6 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -19,3 +19,4 @@ Development developer policies roadmap + meeting diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst new file mode 100644 index 0000000000000..1d19408692cda --- /dev/null +++ b/doc/source/development/meeting.rst @@ -0,0 +1,32 @@ +.. _meeting: + +================== +Developer Meetings +================== + +We hold regular developer meetings on the second Wednesday +of each month at 18:00 UTC. These meetings and their minutes are open to +the public. All are welcome to join. + +Minutes +------- + +The minutes of past meetings are available in `this Google Document `__. + +Calendar +-------- + +This calendar shows all the developer meetings. + +.. raw:: html + + + +You can subscribe to this calendar with the following links: + +* `iCal `__ +* `Google calendar `__ + +Additionally, we'll sometimes have one-off meetings on specific topics. +These will be published on the same calendar. + From 7d7f885856b0c3d51eaf15beaac9d4f30c23797d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 25 Nov 2019 16:49:48 +0100 Subject: [PATCH 185/185] ENH: add BooleanArray extension array (#29555) --- doc/source/getting_started/basics.rst | 1 + doc/source/reference/arrays.rst | 23 + doc/source/whatsnew/v1.0.0.rst | 24 + pandas/__init__.py | 1 + pandas/arrays/__init__.py | 2 + pandas/conftest.py | 14 + pandas/core/api.py | 1 + pandas/core/arrays/__init__.py | 1 + pandas/core/arrays/base.py | 9 + pandas/core/arrays/boolean.py | 745 +++++++++++++++++++++++++ pandas/core/dtypes/missing.py | 2 +- pandas/tests/api/test_api.py | 1 + pandas/tests/arrays/test_boolean.py | 509 +++++++++++++++++ pandas/tests/dtypes/test_common.py | 3 + pandas/tests/extension/test_boolean.py | 333 +++++++++++ 15 files changed, 1668 insertions(+), 1 deletion(-) create mode 100644 pandas/core/arrays/boolean.py create mode 100644 pandas/tests/arrays/test_boolean.py create mode 100644 pandas/tests/extension/test_boolean.py diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 125990f7cadcd..6301fee7775cf 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1950,6 +1950,7 @@ sparse :class:`SparseDtype` (none) :class:`arrays. intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` nullable integer :class:`Int64Dtype`, ... 
(none) :class:`arrays.IntegerArray` :ref:`integer_na` Strings :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` +Boolean (with NA) :class:`BooleanDtype` :class:`bool` :class:`arrays.BooleanArray` :ref:`api.arrays.bool` =================== ========================= ================== ============================= ============================= Pandas has two ways to store strings. diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 0c435e06ac57f..cf14d28772f4c 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -25,6 +25,7 @@ Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.array Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string` +Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` =================== ========================= ================== ============================= Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). @@ -485,6 +486,28 @@ The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arra See :ref:`api.series.str` for more. +.. _api.arrays.bool: + +Boolean data with missing values +-------------------------------- + +The boolean dtype (with the alias ``"boolean"``) provides support for storing +boolean data (True, False values) with missing values, which is not possible +with a bool :class:`numpy.ndarray`. + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + arrays.BooleanArray + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + BooleanDtype + + .. Dtype attributes which are manually listed in their docstrings: including .. it here to make sure a docstring page is built for them diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3990eec2435d9..7d11d90eeb670 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -102,6 +102,30 @@ String accessor methods returning integers will return a value with :class:`Int6 We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. +.. _whatsnew_100.boolean: + +Boolean data type with missing values support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added :class:`BooleanDtype` / :class:`~arrays.BooleanArray`, an extension +type dedicated to boolean data that can hold missing values. With the default +``'bool`` data type based on a numpy bool array, the column can only hold +True or False values and not missing values. This new :class:`BooleanDtype` +can store missing values as well by keeping track of this in a separate mask. +(:issue:`29555`) + +.. ipython:: python + + pd.Series([True, False, None], dtype=pd.BooleanDtype()) + +You can use the alias ``"boolean"`` as well. + +.. ipython:: python + + s = pd.Series([True, False, None], dtype="boolean") + s + + .. 
_whatsnew_1000.enhancements.other: Other enhancements diff --git a/pandas/__init__.py b/pandas/__init__.py index 5d163e411c0ac..cd697b757a26a 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -67,6 +67,7 @@ IntervalDtype, DatetimeTZDtype, StringDtype, + BooleanDtype, # missing isna, isnull, diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 9870b5bed076d..61832a8b6d621 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,6 +4,7 @@ See :ref:`extending.extension-types` for more. """ from pandas.core.arrays import ( + BooleanArray, Categorical, DatetimeArray, IntegerArray, @@ -16,6 +17,7 @@ ) __all__ = [ + "BooleanArray", "Categorical", "DatetimeArray", "IntegerArray", diff --git a/pandas/conftest.py b/pandas/conftest.py index b032e14d8f7e1..78e5b5e12b7e9 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -293,6 +293,20 @@ def compare_operators_no_eq_ne(request): return request.param +@pytest.fixture( + params=["__and__", "__rand__", "__or__", "__ror__", "__xor__", "__rxor__"] +) +def all_logical_operators(request): + """ + Fixture for dunder names for common logical operations + + * | + * & + * ^ + """ + return request.param + + @pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) def compression(request): """ diff --git a/pandas/core/api.py b/pandas/core/api.py index 7df2165201a99..65f0178b19187 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -12,6 +12,7 @@ from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical +from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 03d998707c26b..df26cd94b5ed9 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -4,6 +4,7 @@ ExtensionScalarOpsMixin, try_cast_to_ea, ) +from .boolean import BooleanArray # noqa: F401 from .categorical import Categorical # noqa: F401 from .datetimes import DatetimeArray # noqa: F401 from .integer import IntegerArray, integer_array # noqa: F401 diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index fa0e025c22c88..a444a4e46d0d7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1088,6 +1088,15 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) + @classmethod + def _add_logical_ops(cls): + cls.__and__ = cls._create_logical_method(operator.and_) + cls.__rand__ = cls._create_logical_method(ops.rand_) + cls.__or__ = cls._create_logical_method(operator.or_) + cls.__ror__ = cls._create_logical_method(ops.ror_) + cls.__xor__ = cls._create_logical_method(operator.xor) + cls.__rxor__ = cls._create_logical_method(ops.rxor) + class ExtensionScalarOpsMixin(ExtensionOpsMixin): """ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py new file mode 100644 index 0000000000000..c118b6fe26549 --- /dev/null +++ b/pandas/core/arrays/boolean.py @@ -0,0 +1,745 @@ +import numbers +from typing import TYPE_CHECKING, Type +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas.compat import set_function_name + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + 
is_list_like, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna, notna + +from pandas.core import nanops, ops +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +@register_extension_dtype +class BooleanDtype(ExtensionDtype): + """ + Extension dtype for boolean data. + + .. versionadded:: 1.0.0 + + .. warning:: + + BooleanDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.BooleanDtype() + BooleanDtype + """ + + @property + def na_value(self) -> "Scalar": + """ + BooleanDtype uses :attr:`numpy.nan` as the missing NA value. + + .. warning:: + + `na_value` may change in a future release. + """ + return np.nan + + @property + def type(self) -> Type: + return np.bool_ + + @property + def kind(self) -> str: + return "b" + + @property + def name(self) -> str: + """ + The alias for BooleanDtype is ``'boolean'``. + """ + return "boolean" + + @classmethod + def construct_from_string(cls, string: str) -> ExtensionDtype: + if string == "boolean": + return cls() + return super().construct_from_string(string) + + @classmethod + def construct_array_type(cls) -> "Type[BooleanArray]": + return BooleanArray + + def __repr__(self) -> str: + return "BooleanDtype" + + @property + def _is_boolean(self) -> bool: + return True + + +def coerce_to_array(values, mask=None, copy: bool = False): + """ + Coerce the input values array to numpy arrays with a mask. 
+ + Parameters + ---------- + values : 1D list-like + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + if isinstance(values, BooleanArray): + if mask is not None: + raise ValueError("cannot pass mask for BooleanArray input") + values, mask = values._data, values._mask + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + mask_values = None + if isinstance(values, np.ndarray) and values.dtype == np.bool_: + if copy: + values = values.copy() + else: + # TODO conversion from integer/float ndarray can be done more efficiently + # (avoid roundtrip through object) + values_object = np.asarray(values, dtype=object) + + inferred_dtype = lib.infer_dtype(values_object, skipna=True) + integer_like = ("floating", "integer", "mixed-integer-float") + if inferred_dtype not in ("boolean", "empty") + integer_like: + raise TypeError("Need to pass bool-like values") + + mask_values = isna(values_object) + values = np.zeros(len(values), dtype=bool) + values[~mask_values] = values_object[~mask_values].astype(bool) + + # if the values were integer-like, validate it were actually 0/1's + if inferred_dtype in integer_like: + if not np.all( + values[~mask_values].astype(float) + == values_object[~mask_values].astype(float) + ): + raise TypeError("Need to pass bool-like values") + + if mask is None and mask_values is None: + mask = np.zeros(len(values), dtype=bool) + elif mask is None: + mask = mask_values + else: + if isinstance(mask, np.ndarray) and mask.dtype == np.bool_: + if mask_values is not None: + mask = mask | mask_values + else: + if copy: + mask = mask.copy() + else: + mask = np.array(mask, dtype=bool) + if mask_values is not None: + mask = mask | mask_values + + if not values.ndim == 1: + raise ValueError("values must be a 1D list-like") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D list-like") + + return values, mask + + +class BooleanArray(ExtensionArray, ExtensionOpsMixin): + """ + Array of boolean (True/False) data with missing values. + + This is a pandas Extension array for boolean data, under the hood + represented by 2 numpy arrays: a boolean array with the data and + a boolean array with the mask (True indicating missing). + + To construct an BooleanArray from generic array-like input, use + :func:`pandas.array` specifying ``dtype="boolean"`` (see examples + below). + + .. versionadded:: 1.0.0 + + .. warning:: + + BooleanArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : numpy.ndarray + A 1-d boolean-dtype array with the data. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values (True + indicates missing). + copy : bool, default False + Whether to copy the `values` and `mask` arrays. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + BooleanArray + + Examples + -------- + Create an BooleanArray with :func:`pandas.array`: + + >>> pd.array([True, False, None], dtype="boolean") + + [True, False, NaN] + Length: 3, dtype: boolean + """ + + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): + raise TypeError( + "values should be boolean numpy array. Use " + "the 'array' function instead" + ) + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. 
Use " + "the 'array' function instead" + ) + if not values.ndim == 1: + raise ValueError("values must be a 1D array") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D array") + + if copy: + values = values.copy() + mask = mask.copy() + + self._data = values + self._mask = mask + self._dtype = BooleanDtype() + + @property + def dtype(self): + return self._dtype + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy: bool = False): + if dtype: + assert dtype == "boolean" + values, mask = coerce_to_array(scalars, copy=copy) + return BooleanArray(values, mask) + + @classmethod + def _from_factorized(cls, values, original: "BooleanArray"): + return cls._from_sequence(values, dtype=original.dtype) + + def _formatter(self, boxed=False): + def fmt(x): + if isna(x): + return "NaN" + return str(x) + + return fmt + + def __getitem__(self, item): + if is_integer(item): + if self._mask[item]: + return self.dtype.na_value + return self._data[item] + return type(self)(self._data[item], self._mask[item]) + + def _coerce_to_ndarray(self, force_bool: bool = False): + """ + Coerce to an ndarary of object dtype or bool dtype (if force_bool=True). + + Parameters + ---------- + force_bool : bool, default False + If True, return bool array or raise error if not possible (in + presence of missing values) + """ + if force_bool: + if not self.isna().any(): + return self._data + else: + raise ValueError( + "cannot convert to bool numpy array in presence of missing values" + ) + data = self._data.astype(object) + data[self._mask] = self._na_value + return data + + __array_priority__ = 1000 # higher than ndarray so ops dispatch to us + + def __array__(self, dtype=None): + """ + the array interface, return my values + We return an object array here to preserve our scalar values + """ + if dtype is not None: + if is_bool_dtype(dtype): + return self._coerce_to_ndarray(force_bool=True) + # TODO can optimize this to not go through object dtype for + # numeric dtypes + arr = self._coerce_to_ndarray() + return arr.astype(dtype, copy=False) + # by default (no dtype specified), return an object array + return self._coerce_to_ndarray() + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + + _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # For BooleanArray inputs, we apply the ufunc to ._data + # and mask the result. + if method == "reduce": + # Not clear how to handle missing values in reductions. Raise. + raise NotImplementedError("The 'reduce' method is not supported.") + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (BooleanArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, BooleanArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. 
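+            # reconstruct() re-wraps each ufunc output: boolean results are
+            # re-boxed as BooleanArray with the combined mask, while any
+            # other dtype gets np.nan written into the masked positions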
+
+            if is_bool_dtype(x.dtype):
+                m = mask.copy()
+                return BooleanArray(x, m)
+            else:
+                x[mask] = np.nan
+                return x
+
+        result = getattr(ufunc, method)(*inputs2, **kwargs)
+        if isinstance(result, tuple):
+            return tuple(reconstruct(x) for x in result)
+        else:
+            return reconstruct(result)
+
+    def __iter__(self):
+        for i in range(len(self)):
+            if self._mask[i]:
+                yield self.dtype.na_value
+            else:
+                yield self._data[i]
+
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        # we always fill with False internally
+        # to avoid upcasting
+        data_fill_value = False if isna(fill_value) else fill_value
+        result = take(
+            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
+        )
+
+        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
+
+        # if we are filling
+        # we only fill where the indexer is null
+        # not existing missing values
+        # TODO(jreback) what if we have a non-na float as a fill value?
+        if allow_fill and notna(fill_value):
+            fill_mask = np.asarray(indexer) == -1
+            result[fill_mask] = fill_value
+            mask = mask ^ fill_mask
+
+        return type(self)(result, mask, copy=False)
+
+    def copy(self):
+        data, mask = self._data, self._mask
+        data = data.copy()
+        mask = mask.copy()
+        return type(self)(data, mask, copy=False)
+
+    def __setitem__(self, key, value):
+        _is_scalar = is_scalar(value)
+        if _is_scalar:
+            value = [value]
+        value, mask = coerce_to_array(value)
+
+        if _is_scalar:
+            value = value[0]
+            mask = mask[0]
+
+        self._data[key] = value
+        self._mask[key] = mask
+
+    def __len__(self):
+        return len(self._data)
+
+    @property
+    def nbytes(self):
+        return self._data.nbytes + self._mask.nbytes
+
+    def isna(self):
+        return self._mask
+
+    @property
+    def _na_value(self):
+        return self._dtype.na_value
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        data = np.concatenate([x._data for x in to_concat])
+        mask = np.concatenate([x._mask for x in to_concat])
+        return cls(data, mask)
+
+    def astype(self, dtype, copy=True):
+        """
+        Cast to a NumPy array or ExtensionArray with 'dtype'.
+
+        Parameters
+        ----------
+        dtype : str or dtype
+            Typecode or data-type to which the array is cast.
+        copy : bool, default True
+            Whether to copy the data, even if not necessary. If False,
+            a copy is made only if the old dtype does not match the
+            new dtype.
+
+        Returns
+        -------
+        array : ndarray or ExtensionArray
+            NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.
+
+        Raises
+        ------
+        TypeError
+            if incompatible type with a BooleanDtype, equivalent of same_kind
+            casting
+        """
+        dtype = pandas_dtype(dtype)
+
+        if isinstance(dtype, BooleanDtype):
+            values, mask = coerce_to_array(self, copy=copy)
+            return BooleanArray(values, mask, copy=False)
+
+        if is_bool_dtype(dtype):
+            # astype_nansafe converts np.nan to True
+            if self.isna().any():
+                raise ValueError("cannot convert float NaN to bool")
+            else:
+                return self._data.astype(dtype, copy=copy)
+        if is_extension_array_dtype(dtype) and is_integer_dtype(dtype):
+            from pandas.core.arrays import IntegerArray
+
+            return IntegerArray(
+                self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False
+            )
+        # coerce
+        data = self._coerce_to_ndarray()
+        return astype_nansafe(data, dtype, copy=None)
+
+    def value_counts(self, dropna=True):
+        """
+        Returns a Series containing counts of each category.
+
+        Every category will have an entry, even those with a count of 0.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't include counts of NaN.
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + + """ + + from pandas import Index, Series + + # compute counts on the data with no nans + data = self._data[~self._mask] + value_counts = Index(data).value_counts() + array = value_counts.values + + # TODO(extension) + # if we have allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.values.astype(bool).astype(object) + + # if we want nans, count the mask + if not dropna: + + # TODO(extension) + # appending to an Index *always* infers + # w/o passing the dtype + array = np.append(array, [self._mask.sum()]) + index = Index( + np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object + ) + + return Series(array, index=index) + + def _values_for_argsort(self) -> np.ndarray: + """ + Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort + """ + data = self._data.copy() + data[self._mask] = -1 + return data + + @classmethod + def _create_logical_method(cls, op): + def logical_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. + return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + elif is_list_like(other): + other = np.asarray(other, dtype="bool") + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + other, mask = coerce_to_array(other, copy=False) + + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + return BooleanArray(result, mask) + + name = "__{name}__".format(name=op.__name__) + return set_function_name(logical_method, name, cls) + + @classmethod + def _create_comparison_method(cls, op): + op_name = op.__name__ + + def cmp_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. 
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + result[mask] = op_name == "ne" + return BooleanArray(result, np.zeros(len(result), dtype=bool), copy=False) + + name = "__{name}__".format(name=op.__name__) + return set_function_name(cmp_method, name, cls) + + def _reduce(self, name, skipna=True, **kwargs): + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + if mask.any(): + data = self._data.astype("float64") + data[mask] = self._na_value + + op = getattr(nanops, "nan" + name) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + + # if we have a boolean op, don't coerce + if name in ["any", "all"]: + pass + + # if we have numeric op that would result in an int, coerce to int if possible + elif name in ["sum", "prod"] and notna(result): + int_result = np.int64(result) + if int_result == result: + result = int_result + + elif name in ["min", "max"] and notna(result): + result = np.bool_(result) + + return result + + def _maybe_mask_result(self, result, mask, other, op_name): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + if (is_float_dtype(other) or is_float(other)) or ( + op_name in ["rtruediv", "truediv"] + ): + result[mask] = np.nan + return result + + if is_bool_dtype(result): + return BooleanArray(result, mask, copy=False) + + elif is_integer_dtype(result): + from pandas.core.arrays import IntegerArray + + return IntegerArray(result, mask, copy=False) + else: + result[mask] = np.nan + return result + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + def boolean_arithmetic_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. 
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = "__{name}__".format(name=op_name) + return set_function_name(boolean_arithmetic_method, name, cls) + + +BooleanArray._add_logical_ops() +BooleanArray._add_comparison_ops() +BooleanArray._add_arithmetic_ops() diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index aeba4eebc498e..25d6f87143d72 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -448,7 +448,7 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: return False else: try: - if np.any(left_value != right_value): + if np.any(np.asarray(left_value != right_value)): return False except TypeError as err: if "Cannot compare tz-naive" in str(err): diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 5d11e160bbd71..1282aa6edd538 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -80,6 +80,7 @@ class TestPDApi(Base): "PeriodDtype", "IntervalDtype", "DatetimeTZDtype", + "BooleanDtype", "Int8Dtype", "Int16Dtype", "Int32Dtype", diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py new file mode 100644 index 0000000000000..5cfc7c3837875 --- /dev/null +++ b/pandas/tests/arrays/test_boolean.py @@ -0,0 +1,509 @@ +import operator + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas.arrays import BooleanArray +from pandas.core.arrays.boolean import coerce_to_array +from pandas.tests.extension.base import BaseOpsUtil +import pandas.util.testing as tm + + +def make_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + +@pytest.fixture +def dtype(): + return pd.BooleanDtype() + + +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) + + +def test_boolean_array_constructor(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.tolist(), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, mask.tolist()) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.astype(int), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, None) + + with pytest.raises(ValueError, match="values must be a 1D array"): + BooleanArray(values.reshape(1, -1), mask) + + with 
pytest.raises(ValueError, match="mask must be a 1D array"): + BooleanArray(values, mask.reshape(1, -1)) + + +def test_boolean_array_constructor_copy(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = BooleanArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_boolean_array(): + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, False]) + ) + + result = pd.array([True, False, True], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, True]) + ) + + result = pd.array([True, False, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_all_none(): + expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) + + result = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([True, None], [True, np.nan]), + ([None], [np.nan]), + ([None, np.nan], [np.nan, np.nan]), + ([np.nan, np.nan], [np.nan, np.nan]), + ], +) +def test_to_boolean_array_none_is_nan(a, b): + result = pd.array(a, dtype="boolean") + expected = pd.array(b, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + # "foo", + [1, 2], + [1.0, 2.0], + pd.date_range("20130101", periods=2), + np.array(["foo"]), + [np.nan, {"a": 1}], + ], +) +def test_to_boolean_array_error(values): + # error in converting existing arrays to BooleanArray + with pytest.raises(TypeError): + pd.array(values, dtype="boolean") + + +def test_to_boolean_array_integer_like(): + # integers of 0's and 1's + result = pd.array([1, 0, 1, 0], dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array([1, 0, 1, None], dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_coerce_to_array(): + # TODO this is currently not public API + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = 
BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is values + assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is not values + assert result._mask is not mask + + # mixed missing from values and mask + values = [True, False, None, False] + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray( + np.array([True, False, True, True]), np.array([False, False, True, True]) + ) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) + tm.assert_extension_array_equal(result, expected) + + # raise errors for wrong dimension + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + with pytest.raises(ValueError, match="values must be a 1D list-like"): + coerce_to_array(values.reshape(1, -1)) + + with pytest.raises(ValueError, match="mask must be a 1D list-like"): + coerce_to_array(values, mask=mask.reshape(1, -1)) + + +def test_coerce_to_array_from_boolean_array(): + # passing BooleanArray to coerce_to_array + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + arr = BooleanArray(values, mask) + result = BooleanArray(*coerce_to_array(arr)) + tm.assert_extension_array_equal(result, arr) + # no copy + assert result._data is arr._data + assert result._mask is arr._mask + + result = BooleanArray(*coerce_to_array(arr), copy=True) + tm.assert_extension_array_equal(result, arr) + assert result._data is not arr._data + assert result._mask is not arr._mask + + with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): + coerce_to_array(arr, mask=mask) + + +def test_coerce_to_numpy_array(): + # with missing values -> object dtype + arr = pd.array([True, False, None], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # also with no missing values -> object dtype + arr = pd.array([True, False, True], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # force bool dtype + result = np.array(arr, dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + # with missing values will raise error + arr = pd.array([True, False, None], dtype="boolean") + with pytest.raises(ValueError): + np.array(arr, dtype="bool") + + +def test_astype(): + # with missing values + arr = pd.array([True, False, None], dtype="boolean") + msg = "cannot convert float NaN to" + + with pytest.raises(ValueError, match=msg): + arr.astype("int64") + + with pytest.raises(ValueError, match=msg): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.astype("int64") + expected = np.array([1, 0, 1], dtype="int64") + 
tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("boolean") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, arr) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("Int64") + expected = pd.array([1, 0, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] +) +def test_ufuncs_binary(ufunc): + # two BooleanArrays + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a, a) + expected = pd.array(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s, a) + expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + # Boolean with numpy array + arr = np.array([True, True, False]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a._data, arr), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # BooleanArray with scalar + result = ufunc(a, True) + expected = pd.array(ufunc(a._data, True), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(True, a) + expected = pd.array(ufunc(True, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # not handled types + with pytest.raises(TypeError): + ufunc(a, "test") + + +@pytest.mark.parametrize("ufunc", [np.logical_not]) +def test_ufuncs_unary(ufunc): + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a) + expected = pd.array(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("values", [[True, False], [True, None]]) +def test_ufunc_reduce_raises(values): + a = pd.array(values, dtype="boolean") + with pytest.raises(NotImplementedError): + np.add.reduce(a) + + +class TestLogicalOps(BaseOpsUtil): + def get_op_from_name(self, op_name): + short_opname = op_name.strip("_") + short_opname = short_opname if "xor" in short_opname else short_opname + "_" + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + + # fill the nan locations + expected[data._mask] = np.nan + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + 
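+        # the Series round-trip checks that pandas dispatches the logical
+        # dunder back to BooleanArray and that the mask survives the boxing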
result = op(s, other) + + expected = pd.Series(data._data) + expected = op(expected, other) + expected = pd.Series(expected, dtype="boolean") + + # fill the nan locations + expected[data._mask] = np.nan + + tm.assert_series_equal(result, expected) + + def test_scalar(self, data, all_logical_operators): + op_name = all_logical_operators + self._compare_other(data, op_name, True) + + def test_array(self, data, all_logical_operators): + op_name = all_logical_operators + other = pd.array([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + other = np.array([True] * len(data)) + self._compare_other(data, op_name, other) + other = pd.Series([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + + # fill the nan locations + expected[data._mask] = op_name == "__ne__" + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = pd.Series(data._data) + expected = op(expected, other) + expected = expected.astype("boolean") + + # fill the nan locations + expected[data._mask] = op_name == "__ne__" + + tm.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + self._compare_other(data, op_name, True) + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + other = pd.array([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + other = np.array([True] * len(data)) + self._compare_other(data, op_name, other) + other = pd.Series([True] * len(data)) + self._compare_other(data, op_name, other) + + +class TestArithmeticOps(BaseOpsUtil): + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + with pytest.raises(TypeError): + ops("foo") + with pytest.raises(TypeError): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + if op not in ("__mul__", "__rmul__"): + # TODO(extension) numpy's mul with object array sees booleans as numbers + with pytest.raises(TypeError): + ops(pd.Series("foo", index=s.index)) + + # 2d + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + + with pytest.raises(NotImplementedError): + opa(np.arange(len(s)).reshape(-1, len(s))) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_reductions_return_types(dropna, data, all_numeric_reductions): + op = all_numeric_reductions + s = pd.Series(data) + if dropna: + s = s.dropna() + + if op in ("sum", "prod"): + assert isinstance(getattr(s, op)(), np.int64) + elif op in ("min", "max"): + assert isinstance(getattr(s, op)(), np.bool_) + else: + # "mean", "std", "var", "median", "kurt", "skew" + assert isinstance(getattr(s, op)(), np.float64) + + +# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion +# manually in the indexing code +# def test_indexing_boolean_mask(): +# arr = pd.array([1, 2, 3, 4], dtype="Int64") +# mask = pd.array([True, False, True, False], dtype="boolean") +# result = arr[mask] +# expected = pd.array([1, 3], dtype="Int64") +# tm.assert_extension_array_equal(result, expected) + +# # missing values -> error +# mask = 
pd.array([True, False, True, None], dtype="boolean") +# with pytest.raises(IndexError): +# result = arr[mask] + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + expected = pa.array(np.array(data, dtype=object), type=pa.bool_(), from_pandas=True) + assert arr.equals(expected) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 6d91d13027f69..912fce6339716 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -529,6 +529,9 @@ def test_is_bool_dtype(): assert com.is_bool_dtype(np.array([True, False])) assert com.is_bool_dtype(pd.Index([True, False])) + assert com.is_bool_dtype(pd.BooleanDtype()) + assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean")) + @pytest.mark.filterwarnings("ignore:'is_extension_type' is deprecated:FutureWarning") @pytest.mark.parametrize( diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py new file mode 100644 index 0000000000000..089dd798b2512 --- /dev/null +++ b/pandas/tests/extension/test_boolean.py @@ -0,0 +1,333 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). + +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. 
+ +""" +import numpy as np +import pytest + +from pandas.compat.numpy import _np_version_under1p14 + +import pandas as pd +from pandas.core.arrays.boolean import BooleanDtype +from pandas.tests.extension import base +import pandas.util.testing as tm + + +def make_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + +@pytest.fixture +def dtype(): + return BooleanDtype() + + +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) + + +@pytest.fixture +def data_for_twos(dtype): + return pd.array(np.ones(100), dtype=dtype) + + +@pytest.fixture +def data_missing(dtype): + return pd.array([np.nan, True], dtype=dtype) + + +@pytest.fixture +def data_for_sorting(dtype): + return pd.array([True, True, False], dtype=dtype) + + +@pytest.fixture +def data_missing_for_sorting(dtype): + return pd.array([True, np.nan, False], dtype=dtype) + + +@pytest.fixture +def na_cmp(): + # we are np.nan + return lambda x, y: np.isnan(x) and np.isnan(y) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def data_for_grouping(dtype): + b = True + a = False + na = np.nan + return pd.array([b, b, na, na, a, a, b], dtype=dtype) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestArithmeticOps(base.BaseArithmeticOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + if op_name in ("__sub__", "__rsub__"): + # subtraction for bools raises TypeError (but not yet in 1.13) + if _np_version_under1p14: + pytest.skip("__sub__ does not yet raise in numpy 1.13") + with pytest.raises(TypeError): + op(s, other) + + return + + result = op(s, other) + expected = s.combine(other, op) + + if op_name in ( + "__floordiv__", + "__rfloordiv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", + ): + # combine keeps boolean type + expected = expected.astype("Int8") + elif op_name in ("__truediv__", "__rtruediv__"): + # combine with bools does not generate the correct result + # (numpy behaviour for div is to regard the bools as numeric) + expected = s.astype(float).combine(other, op) + if op_name == "__rpow__": + # for rpow, combine does not propagate NaN + expected[result.isna()] = np.nan + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + + def _check_divmod_op(self, s, op, other, exc=None): + # override to not raise an error + super()._check_divmod_op(s, op, other, None) + + @pytest.mark.skip(reason="BooleanArray does not error on ops") + def test_error(self, data, all_arithmetic_operators): + # other specific errors tested in the boolean array specific tests + pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _compare_other(self, s, data, op_name, other): + self.check_opname(s, op_name, other) + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + 
@pytest.mark.parametrize("na_sentinel", [-1, -2]) + def test_factorize(self, data_for_grouping, na_sentinel): + # override because we only have 2 unique values + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_labels = np.array( + [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp + ) + expected_uniques = data_for_grouping.take([0, 4]) + + tm.assert_numpy_array_equal(labels, expected_labels) + self.assert_extension_array_equal(uniques, expected_uniques) + + def test_combine_le(self, data_repeated): + # override because expected needs to be boolean instead of bool dtype + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype="boolean", + ) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean") + self.assert_series_equal(result, expected) + + def test_searchsorted(self, data_for_sorting, as_series): + # override because we only have 2 unique values + data_for_sorting = pd.array([True, False], dtype="boolean") + b, a = data_for_sorting + arr = type(data_for_sorting)._from_sequence([a, b]) + + if as_series: + arr = pd.Series(arr) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + result = arr.searchsorted(arr.take([0, 1])) + expected = np.array([0, 1], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = np.array([1, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + + +class TestCasting(base.BaseCastingTests): + pass + + +class TestGroupby(base.BaseGroupbyTests): + """ + Groupby-specific tests are overridden because boolean only has 2 + unique values, base tests uses 3 groups. 
+ """ + + def test_grouping_grouper(self, data_for_grouping): + df = pd.DataFrame( + {"A": ["B", "B", None, None, "A", "A", "B"], "B": data_for_grouping} + ) + gr1 = df.groupby("A").grouper.groupings[0] + gr2 = df.groupby("B").grouper.groupings[0] + + tm.assert_numpy_array_equal(gr1.grouper, df.A.values) + tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("B", as_index=as_index).A.mean() + _, index = pd.factorize(data_for_grouping, sort=True) + + index = pd.Index(index, name="B") + expected = pd.Series([3, 1], index=index, name="A") + if as_index: + self.assert_series_equal(result, expected) + else: + expected = expected.reset_index() + self.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + + index = pd.Index(index, name="B") + expected = pd.Series([1, 3], index=index, name="A") + self.assert_series_equal(result, expected) + + def test_groupby_extension_transform(self, data_for_grouping): + valid = data_for_grouping[~data_for_grouping.isna()] + df = pd.DataFrame({"A": [1, 1, 3, 3, 1], "B": valid}) + + result = df.groupby("B").A.transform(len) + expected = pd.Series([3, 3, 2, 2, 3], name="A") + + self.assert_series_equal(result, expected) + + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + df.groupby("B").apply(groupby_apply_op) + df.groupby("B").A.apply(groupby_apply_op) + df.groupby("A").apply(groupby_apply_op) + df.groupby("A").B.apply(groupby_apply_op) + + def test_groupby_apply_identity(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("A").B.apply(lambda x: x.array) + expected = pd.Series( + [ + df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + ], + index=pd.Index([1, 2, 3], name="A"), + name="B", + ) + self.assert_series_equal(result, expected) + + def test_in_numeric_groupby(self, data_for_grouping): + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1], + } + ) + result = df.groupby("A").sum().columns + + if data_for_grouping.dtype._is_numeric: + expected = pd.Index(["B", "C"]) + else: + expected = pd.Index(["C"]) + + tm.assert_index_equal(result, expected) + + +class TestNumericReduce(base.BaseNumericReduceTests): + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + # override parent function to cast to bool for min/max + if op_name in ("min", "max") and not pd.isna(expected): + expected = bool(expected) + tm.assert_almost_equal(result, expected) + + +class TestBooleanReduce(base.BaseBooleanReduceTests): + pass + + +class TestPrinting(base.BasePrintingTests): + pass + + +# TODO parsing not yet supported +# class TestParsing(base.BaseParsingTests): +# pass