From 95f73c3dab1eb472d27121c09723b8abeab9eeb7 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 21 Dec 2018 08:46:10 -0800 Subject: [PATCH 01/15] REF/TST: Add pytest idiom to test_testing (#24369) Splits test file into multiple new files and delegates remaining tests to the miscellaneous test_util.py --- ci/code_checks.sh | 2 +- pandas/tests/util/conftest.py | 26 + pandas/tests/util/test_assert_almost_equal.py | 350 +++++++ .../util/test_assert_categorical_equal.py | 92 ++ .../util/test_assert_extension_array_equal.py | 102 ++ pandas/tests/util/test_assert_frame_equal.py | 209 ++++ pandas/tests/util/test_assert_index_equal.py | 179 ++++ .../util/test_assert_interval_array_equal.py | 80 ++ .../util/test_assert_numpy_array_equal.py | 177 ++++ pandas/tests/util/test_assert_series_equal.py | 185 ++++ pandas/tests/util/test_testing.py | 984 ------------------ pandas/tests/util/test_util.py | 88 +- 12 files changed, 1484 insertions(+), 990 deletions(-) create mode 100644 pandas/tests/util/conftest.py create mode 100644 pandas/tests/util/test_assert_almost_equal.py create mode 100644 pandas/tests/util/test_assert_categorical_equal.py create mode 100644 pandas/tests/util/test_assert_extension_array_equal.py create mode 100644 pandas/tests/util/test_assert_frame_equal.py create mode 100644 pandas/tests/util/test_assert_index_equal.py create mode 100644 pandas/tests/util/test_assert_interval_array_equal.py create mode 100644 pandas/tests/util/test_assert_numpy_array_equal.py create mode 100644 pandas/tests/util/test_assert_series_equal.py delete mode 100644 pandas/tests/util/test_testing.py diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 0c9a24801f98c..b594f6a2f8df6 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -145,7 +145,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check that the deprecated `assert_raises_regex` is not used (`pytest.raises(match=pattern)` should be used instead)' ; echo $MSG - 
invgrep -R --exclude=*.pyc --exclude=testing.py --exclude=test_testing.py assert_raises_regex pandas + invgrep -R --exclude=*.pyc --exclude=testing.py --exclude=test_util.py assert_raises_regex pandas RET=$(($RET + $?)) ; echo $MSG "DONE" # Check that we use pytest.raises only as a context manager diff --git a/pandas/tests/util/conftest.py b/pandas/tests/util/conftest.py new file mode 100644 index 0000000000000..5eff49ab774b5 --- /dev/null +++ b/pandas/tests/util/conftest.py @@ -0,0 +1,26 @@ +import pytest + + +@pytest.fixture(params=[True, False]) +def check_dtype(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def check_exact(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def check_index_type(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def check_less_precise(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def check_categorical(request): + return request.param diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py new file mode 100644 index 0000000000000..afee9c008295f --- /dev/null +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -0,0 +1,350 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series, Timestamp +from pandas.util.testing import assert_almost_equal + + +def _assert_almost_equal_both(a, b, **kwargs): + """ + Check that two objects are approximately equal. + + This check is performed commutatively. + + Parameters + ---------- + a : object + The first object to compare. + b : object + The second object to compare. + kwargs : dict + The arguments passed to `assert_almost_equal`. + """ + assert_almost_equal(a, b, **kwargs) + assert_almost_equal(b, a, **kwargs) + + +def _assert_not_almost_equal(a, b, **kwargs): + """ + Check that two objects are not approximately equal. 
+ + Parameters + ---------- + a : object + The first object to compare. + b : object + The second object to compare. + kwargs : dict + The arguments passed to `assert_almost_equal`. + """ + try: + assert_almost_equal(a, b, **kwargs) + msg = ("{a} and {b} were approximately equal " + "when they shouldn't have been").format(a=a, b=b) + pytest.fail(msg=msg) + except AssertionError: + pass + + +def _assert_not_almost_equal_both(a, b, **kwargs): + """ + Check that two objects are not approximately equal. + + This check is performed commutatively. + + Parameters + ---------- + a : object + The first object to compare. + b : object + The second object to compare. + kwargs : dict + The arguments passed to `tm.assert_almost_equal`. + """ + _assert_not_almost_equal(a, b, **kwargs) + _assert_not_almost_equal(b, a, **kwargs) + + +@pytest.mark.parametrize("a,b", [ + (1.1, 1.1), (1.1, 1.100001), (np.int16(1), 1.000001), + (np.float64(1.1), 1.1), (np.uint32(5), 5), +]) +def test_assert_almost_equal_numbers(a, b): + _assert_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [ + (1.1, 1), (1.1, True), (1, 2), (1.0001, np.int16(1)), +]) +def test_assert_not_almost_equal_numbers(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [ + (0, 0), (0, 0.0), (0, np.float64(0)), (0.000001, 0), +]) +def test_assert_almost_equal_numbers_with_zeros(a, b): + _assert_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [ + (0.001, 0), (1, 0), +]) +def test_assert_not_almost_equal_numbers_with_zeros(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [ + (1, "abc"), (1, [1, ]), (1, object()), +]) +def test_assert_not_almost_equal_numbers_with_mixed(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize( + "left_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"]) +@pytest.mark.parametrize( + "right_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"]) +def 
test_assert_almost_equal_edge_case_ndarrays(left_dtype, right_dtype): + # Empty compare. + _assert_almost_equal_both(np.array([], dtype=left_dtype), + np.array([], dtype=right_dtype), + check_dtype=False) + + +def test_assert_almost_equal_dicts(): + _assert_almost_equal_both({"a": 1, "b": 2}, {"a": 1, "b": 2}) + + +@pytest.mark.parametrize("a,b", [ + ({"a": 1, "b": 2}, {"a": 1, "b": 3}), + ({"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}), + ({"a": 1}, 1), ({"a": 1}, "abc"), ({"a": 1}, [1, ]), +]) +def test_assert_not_almost_equal_dicts(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize("val", [1, 2]) +def test_assert_almost_equal_dict_like_object(val): + dict_val = 1 + real_dict = dict(a=val) + + class DictLikeObj(object): + def keys(self): + return "a", + + def __getitem__(self, item): + if item == "a": + return dict_val + + func = (_assert_almost_equal_both if val == dict_val + else _assert_not_almost_equal_both) + func(real_dict, DictLikeObj(), check_dtype=False) + + +def test_assert_almost_equal_strings(): + _assert_almost_equal_both("abc", "abc") + + +@pytest.mark.parametrize("a,b", [ + ("abc", "abcd"), ("abc", "abd"), ("abc", 1), ("abc", [1, ]), +]) +def test_assert_not_almost_equal_strings(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [ + ([1, 2, 3], [1, 2, 3]), (np.array([1, 2, 3]), np.array([1, 2, 3])), +]) +def test_assert_almost_equal_iterables(a, b): + _assert_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [ + # Class is different. + (np.array([1, 2, 3]), [1, 2, 3]), + + # Dtype is different. + (np.array([1, 2, 3]), np.array([1., 2., 3.])), + + # Can't compare generators. 
+ (iter([1, 2, 3]), [1, 2, 3]), ([1, 2, 3], [1, 2, 4]), + ([1, 2, 3], [1, 2, 3, 4]), ([1, 2, 3], 1), +]) +def test_assert_not_almost_equal_iterables(a, b): + _assert_not_almost_equal(a, b) + + +def test_assert_almost_equal_null(): + _assert_almost_equal_both(None, None) + + +@pytest.mark.parametrize("a,b", [ + (None, np.NaN), (None, 0), (np.NaN, 0), +]) +def test_assert_not_almost_equal_null(a, b): + _assert_not_almost_equal(a, b) + + +@pytest.mark.parametrize("a,b", [ + (np.inf, np.inf), (np.inf, float("inf")), + (np.array([np.inf, np.nan, -np.inf]), + np.array([np.inf, np.nan, -np.inf])), + (np.array([np.inf, None, -np.inf], dtype=np.object_), + np.array([np.inf, np.nan, -np.inf], dtype=np.object_)), +]) +def test_assert_almost_equal_inf(a, b): + _assert_almost_equal_both(a, b) + + +def test_assert_not_almost_equal_inf(): + _assert_not_almost_equal_both(np.inf, 0) + + +@pytest.mark.parametrize("a,b", [ + (Index([1., 1.1]), Index([1., 1.100001])), + (Series([1., 1.1]), Series([1., 1.100001])), + (np.array([1.1, 2.000001]), np.array([1.1, 2.0])), + (DataFrame({"a": [1., 1.1]}), DataFrame({"a": [1., 1.100001]})) +]) +def test_assert_almost_equal_pandas(a, b): + _assert_almost_equal_both(a, b) + + +def test_assert_almost_equal_object(): + a = [Timestamp("2011-01-01"), Timestamp("2011-01-01")] + b = [Timestamp("2011-01-01"), Timestamp("2011-01-01")] + _assert_almost_equal_both(a, b) + + +def test_assert_almost_equal_value_mismatch(): + msg = "expected 2\\.00000 but got 1\\.00000, with decimal 5" + + with pytest.raises(AssertionError, match=msg): + assert_almost_equal(1, 2) + + +@pytest.mark.parametrize("a,b,klass1,klass2", [ + (np.array([1]), 1, "ndarray", "int"), + (1, np.array([1]), "int", "ndarray"), +]) +def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2): + msg = """numpy array are different + +numpy array classes are different +\\[left\\]: {klass1} +\\[right\\]: {klass2}""".format(klass1=klass1, klass2=klass2) + + with pytest.raises(AssertionError, 
match=msg): + assert_almost_equal(a, b) + + +def test_assert_almost_equal_value_mismatch1(): + msg = """numpy array are different + +numpy array values are different \\(66\\.66667 %\\) +\\[left\\]: \\[nan, 2\\.0, 3\\.0\\] +\\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_almost_equal(np.array([np.nan, 2, 3]), + np.array([1, np.nan, 3])) + + +def test_assert_almost_equal_value_mismatch2(): + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[1, 2\\] +\\[right\\]: \\[1, 3\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_almost_equal(np.array([1, 2]), np.array([1, 3])) + + +def test_assert_almost_equal_value_mismatch3(): + msg = """numpy array are different + +numpy array values are different \\(16\\.66667 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_almost_equal(np.array([[1, 2], [3, 4], [5, 6]]), + np.array([[1, 3], [3, 4], [5, 6]])) + + +def test_assert_almost_equal_value_mismatch4(): + msg = """numpy array are different + +numpy array values are different \\(25\\.0 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_almost_equal(np.array([[1, 2], [3, 4]]), + np.array([[1, 3], [3, 4]])) + + +def test_assert_almost_equal_shape_mismatch_override(): + msg = """Index are different + +Index shapes are different +\\[left\\]: \\(2L*,\\) +\\[right\\]: \\(3L*,\\)""" + with pytest.raises(AssertionError, match=msg): + assert_almost_equal(np.array([1, 2]), + np.array([3, 4, 5]), + obj="Index") + + +def test_assert_almost_equal_unicode(): + # see gh-20503 + msg = """numpy array are different + +numpy array values are different \\(33\\.33333 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[á, à, å\\]""" + + with 
pytest.raises(AssertionError, match=msg): + assert_almost_equal(np.array([u"á", u"à", u"ä"]), + np.array([u"á", u"à", u"å"])) + + +def test_assert_almost_equal_timestamp(): + a = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-01")]) + b = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]) + + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\] +\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_almost_equal(a, b) + + +def test_assert_almost_equal_iterable_length_mismatch(): + msg = """Iterable are different + +Iterable length are different +\\[left\\]: 2 +\\[right\\]: 3""" + + with pytest.raises(AssertionError, match=msg): + assert_almost_equal([1, 2], [3, 4, 5]) + + +def test_assert_almost_equal_iterable_values_mismatch(): + msg = """Iterable are different + +Iterable values are different \\(50\\.0 %\\) +\\[left\\]: \\[1, 2\\] +\\[right\\]: \\[1, 3\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_almost_equal([1, 2], [1, 3]) diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py new file mode 100644 index 0000000000000..04c8301027039 --- /dev/null +++ b/pandas/tests/util/test_assert_categorical_equal.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +import pytest + +from pandas import Categorical +from pandas.util.testing import assert_categorical_equal + + +@pytest.mark.parametrize("c", [ + Categorical([1, 2, 3, 4]), + Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4, 5]), +]) +def test_categorical_equal(c): + assert_categorical_equal(c, c) + + +@pytest.mark.parametrize("check_category_order", [True, False]) +def test_categorical_equal_order_mismatch(check_category_order): + c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + c2 = Categorical([1, 2, 3, 4], categories=[4, 3, 2, 1]) + kwargs = 
dict(check_category_order=check_category_order) + + if check_category_order: + msg = """Categorical\\.categories are different + +Categorical\\.categories values are different \\(100\\.0 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[4, 3, 2, 1\\], dtype='int64'\\)""" + with pytest.raises(AssertionError, match=msg): + assert_categorical_equal(c1, c2, **kwargs) + else: + assert_categorical_equal(c1, c2, **kwargs) + + +def test_categorical_equal_categories_mismatch(): + msg = """Categorical\\.categories are different + +Categorical\\.categories values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 5\\], dtype='int64'\\)""" + + c1 = Categorical([1, 2, 3, 4]) + c2 = Categorical([1, 2, 3, 5]) + + with pytest.raises(AssertionError, match=msg): + assert_categorical_equal(c1, c2) + + +def test_categorical_equal_codes_mismatch(): + categories = [1, 2, 3, 4] + msg = """Categorical\\.codes are different + +Categorical\\.codes values are different \\(50\\.0 %\\) +\\[left\\]: \\[0, 1, 3, 2\\] +\\[right\\]: \\[0, 1, 2, 3\\]""" + + c1 = Categorical([1, 2, 4, 3], categories=categories) + c2 = Categorical([1, 2, 3, 4], categories=categories) + + with pytest.raises(AssertionError, match=msg): + assert_categorical_equal(c1, c2) + + +def test_categorical_equal_ordered_mismatch(): + data = [1, 2, 3, 4] + msg = """Categorical are different + +Attribute "ordered" are different +\\[left\\]: False +\\[right\\]: True""" + + c1 = Categorical(data, ordered=False) + c2 = Categorical(data, ordered=True) + + with pytest.raises(AssertionError, match=msg): + assert_categorical_equal(c1, c2) + + +@pytest.mark.parametrize("obj", ["index", "foo", "pandas"]) +def test_categorical_equal_object_override(obj): + data = [1, 2, 3, 4] + msg = """{obj} are different + +Attribute "ordered" are different +\\[left\\]: False +\\[right\\]: True""".format(obj=obj) + + c1 = 
Categorical(data, ordered=False) + c2 = Categorical(data, ordered=True) + + with pytest.raises(AssertionError, match=msg): + assert_categorical_equal(c1, c2, obj=obj) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py new file mode 100644 index 0000000000000..3149078a56783 --- /dev/null +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pytest + +from pandas.core.arrays.sparse import SparseArray +from pandas.util.testing import assert_extension_array_equal + + +@pytest.mark.parametrize("kwargs", [ + dict(), # Default is check_exact=False + dict(check_exact=False), dict(check_exact=True) +]) +def test_assert_extension_array_equal_not_exact(kwargs): + # see gh-23709 + arr1 = SparseArray([-0.17387645482451206, 0.3414148016424936]) + arr2 = SparseArray([-0.17387645482451206, 0.3414148016424937]) + + if kwargs.get("check_exact", False): + msg = """\ +ExtensionArray are different + +ExtensionArray values are different \\(50\\.0 %\\) +\\[left\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\] +\\[right\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_extension_array_equal(arr1, arr2, **kwargs) + else: + assert_extension_array_equal(arr1, arr2, **kwargs) + + +@pytest.mark.parametrize("check_less_precise", [ + True, False, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +]) +def test_assert_extension_array_equal_less_precise(check_less_precise): + arr1 = SparseArray([0.5, 0.123456]) + arr2 = SparseArray([0.5, 0.123457]) + + kwargs = dict(check_less_precise=check_less_precise) + + if check_less_precise is False or check_less_precise >= 5: + msg = """\ +ExtensionArray are different + +ExtensionArray values are different \\(50\\.0 %\\) +\\[left\\]: \\[0\\.5, 0\\.123456\\] +\\[right\\]: \\[0\\.5, 0\\.123457\\]""" + + with pytest.raises(AssertionError, match=msg): + 
assert_extension_array_equal(arr1, arr2, **kwargs) + else: + assert_extension_array_equal(arr1, arr2, **kwargs) + + +def test_assert_extension_array_equal_dtype_mismatch(check_dtype): + end = 5 + kwargs = dict(check_dtype=check_dtype) + + arr1 = SparseArray(np.arange(end, dtype="int64")) + arr2 = SparseArray(np.arange(end, dtype="int32")) + + if check_dtype: + msg = """\ +ExtensionArray are different + +Attribute "dtype" are different +\\[left\\]: Sparse\\[int64, 0\\] +\\[right\\]: Sparse\\[int32, 0\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_extension_array_equal(arr1, arr2, **kwargs) + else: + assert_extension_array_equal(arr1, arr2, **kwargs) + + +def test_assert_extension_array_equal_missing_values(): + arr1 = SparseArray([np.nan, 1, 2, np.nan]) + arr2 = SparseArray([np.nan, 1, 2, 3]) + + msg = """\ +ExtensionArray NA mask are different + +ExtensionArray NA mask values are different \\(25\\.0 %\\) +\\[left\\]: \\[True, False, False, True\\] +\\[right\\]: \\[True, False, False, False\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_extension_array_equal(arr1, arr2) + + +@pytest.mark.parametrize("side", ["left", "right"]) +def test_assert_extension_array_equal_non_extension_array(side): + numpy_array = np.arange(5) + extension_array = SparseArray(numpy_array) + + msg = "{side} is not an ExtensionArray".format(side=side) + args = ((numpy_array, extension_array) if side == "left" + else (extension_array, numpy_array)) + + with pytest.raises(AssertionError, match=msg): + assert_extension_array_equal(*args) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py new file mode 100644 index 0000000000000..1a941c0f0c265 --- /dev/null +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -0,0 +1,209 @@ +# -*- coding: utf-8 -*- + +import pytest + +from pandas import DataFrame +from pandas.util.testing import assert_frame_equal + + +@pytest.fixture(params=[True, False]) +def 
by_blocks(request): + return request.param + + +def _assert_frame_equal_both(a, b, **kwargs): + """ + Check that two DataFrame equal. + + This check is performed commutatively. + + Parameters + ---------- + a : DataFrame + The first DataFrame to compare. + b : DataFrame + The second DataFrame to compare. + kwargs : dict + The arguments passed to `assert_frame_equal`. + """ + assert_frame_equal(a, b, **kwargs) + assert_frame_equal(b, a, **kwargs) + + +def _assert_not_frame_equal(a, b, **kwargs): + """ + Check that two DataFrame are not equal. + + Parameters + ---------- + a : DataFrame + The first DataFrame to compare. + b : DataFrame + The second DataFrame to compare. + kwargs : dict + The arguments passed to `assert_frame_equal`. + """ + try: + assert_frame_equal(a, b, **kwargs) + msg = "The two DataFrames were equal when they shouldn't have been" + + pytest.fail(msg=msg) + except AssertionError: + pass + + +def _assert_not_frame_equal_both(a, b, **kwargs): + """ + Check that two DataFrame are not equal. + + This check is performed commutatively. + + Parameters + ---------- + a : DataFrame + The first DataFrame to compare. + b : DataFrame + The second DataFrame to compare. + kwargs : dict + The arguments passed to `assert_frame_equal`. + """ + _assert_not_frame_equal(a, b, **kwargs) + _assert_not_frame_equal(b, a, **kwargs) + + +@pytest.mark.parametrize("check_like", [True, False]) +def test_frame_equal_row_order_mismatch(check_like): + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, + index=["a", "b", "c"]) + df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, + index=["c", "b", "a"]) + + if not check_like: # Do not ignore row-column orderings. 
+ msg = "DataFrame.index are different" + with pytest.raises(AssertionError, match=msg): + assert_frame_equal(df1, df2, check_like=check_like) + else: + _assert_frame_equal_both(df1, df2, check_like=check_like) + + +@pytest.mark.parametrize("df1,df2", [ + (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})), + (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})), +]) +def test_frame_equal_shape_mismatch(df1, df2): + msg = "DataFrame are different" + + with pytest.raises(AssertionError, match=msg): + assert_frame_equal(df1, df2) + + +@pytest.mark.parametrize("df1,df2,msg", [ + # Index + (DataFrame.from_records({"a": [1, 2], + "c": ["l1", "l2"]}, index=["a"]), + DataFrame.from_records({"a": [1.0, 2.0], + "c": ["l1", "l2"]}, index=["a"]), + "DataFrame\\.index are different"), + + # MultiIndex + (DataFrame.from_records({"a": [1, 2], "b": [2.1, 1.5], + "c": ["l1", "l2"]}, index=["a", "b"]), + DataFrame.from_records({"a": [1.0, 2.0], "b": [2.1, 1.5], + "c": ["l1", "l2"]}, index=["a", "b"]), + "MultiIndex level \\[0\\] are different") +]) +def test_frame_equal_index_dtype_mismatch(df1, df2, msg, check_index_type): + kwargs = dict(check_index_type=check_index_type) + + if check_index_type: + with pytest.raises(AssertionError, match=msg): + assert_frame_equal(df1, df2, **kwargs) + else: + assert_frame_equal(df1, df2, **kwargs) + + +def test_empty_dtypes(check_dtype): + columns = ["col1", "col2"] + df1 = DataFrame(columns=columns) + df2 = DataFrame(columns=columns) + + kwargs = dict(check_dtype=check_dtype) + df1["col1"] = df1["col1"].astype("int64") + + if check_dtype: + msg = "Attributes are different" + with pytest.raises(AssertionError, match=msg): + assert_frame_equal(df1, df2, **kwargs) + else: + assert_frame_equal(df1, df2, **kwargs) + + +def test_frame_equal_index_mismatch(): + msg = """DataFrame\\.index are different + +DataFrame\\.index values are different \\(33\\.33333 %\\) +\\[left\\]: Index\\(\\[u?'a', u?'b', u?'c'\\], 
dtype='object'\\) +\\[right\\]: Index\\(\\[u?'a', u?'b', u?'d'\\], dtype='object'\\)""" + + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, + index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, + index=["a", "b", "d"]) + + with pytest.raises(AssertionError, match=msg): + assert_frame_equal(df1, df2) + + +def test_frame_equal_columns_mismatch(): + msg = """DataFrame\\.columns are different + +DataFrame\\.columns values are different \\(50\\.0 %\\) +\\[left\\]: Index\\(\\[u?'A', u?'B'\\], dtype='object'\\) +\\[right\\]: Index\\(\\[u?'A', u?'b'\\], dtype='object'\\)""" + + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, + index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, + index=["a", "b", "c"]) + + with pytest.raises(AssertionError, match=msg): + assert_frame_equal(df1, df2) + + +def test_frame_equal_block_mismatch(by_blocks): + msg = """DataFrame\\.iloc\\[:, 1\\] are different + +DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +\\[left\\]: \\[4, 5, 6\\] +\\[right\\]: \\[4, 5, 7\\]""" + + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) + + with pytest.raises(AssertionError, match=msg): + assert_frame_equal(df1, df2, by_blocks=by_blocks) + + +@pytest.mark.parametrize("df1,df2,msg", [ + (DataFrame({"A": [u"á", u"à", u"ä"], "E": [u"é", u"è", u"ë"]}), + DataFrame({"A": [u"á", u"à", u"ä"], "E": [u"é", u"è", u"e̊"]}), + """DataFrame\\.iloc\\[:, 1\\] are different + +DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +\\[left\\]: \\[é, è, ë\\] +\\[right\\]: \\[é, è, e̊\\]"""), + (DataFrame({"A": [u"á", u"à", u"ä"], "E": [u"é", u"è", u"ë"]}), + DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), + """DataFrame\\.iloc\\[:, 0\\] are different + +DataFrame\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[a, a, a\\]"""), +]) +def test_frame_equal_unicode(df1, df2, msg, by_blocks): + # 
see gh-20503 + # + # Test ensures that `assert_frame_equals` raises the right exception + # when comparing DataFrames containing differing unicode objects. + with pytest.raises(AssertionError, match=msg): + assert_frame_equal(df1, df2, by_blocks=by_blocks) diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py new file mode 100644 index 0000000000000..b96345d4bd7ce --- /dev/null +++ b/pandas/tests/util/test_assert_index_equal.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pytest + +from pandas import Categorical, Index, MultiIndex, NaT +from pandas.util.testing import assert_index_equal + + +def test_index_equal_levels_mismatch(): + msg = """Index are different + +Index levels are different +\\[left\\]: 1, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: 2, MultiIndex\\(levels=\\[\\[u?'A', u?'B'\\], \\[1, 2, 3, 4\\]\\], + labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)""" + + idx1 = Index([1, 2, 3]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), + ("B", 3), ("B", 4)]) + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, exact=False) + + +def test_index_equal_values_mismatch(check_exact): + msg = """MultiIndex level \\[1\\] are different + +MultiIndex level \\[1\\] values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + + idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), + ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), + ("B", 3), ("B", 4)]) + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, check_exact=check_exact) + + +def test_index_equal_length_mismatch(check_exact): + msg = """Index are different + +Index length are different +\\[left\\]: 3, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: 4, Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + + idx1 = 
Index([1, 2, 3]) + idx2 = Index([1, 2, 3, 4]) + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, check_exact=check_exact) + + +def test_index_equal_class_mismatch(check_exact): + msg = """Index are different + +Index classes are different +\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: Float64Index\\(\\[1\\.0, 2\\.0, 3\\.0\\], dtype='float64'\\)""" + + idx1 = Index([1, 2, 3]) + idx2 = Index([1, 2, 3.0]) + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, exact=True, check_exact=check_exact) + + +def test_index_equal_values_close(check_exact): + idx1 = Index([1, 2, 3.]) + idx2 = Index([1, 2, 3.0000000001]) + + if check_exact: + msg = """Index are different + +Index values are different \\(33\\.33333 %\\) +\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\) +\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0000000001\\], dtype='float64'\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, check_exact=check_exact) + else: + assert_index_equal(idx1, idx2, check_exact=check_exact) + + +def test_index_equal_values_less_close(check_exact, check_less_precise): + idx1 = Index([1, 2, 3.]) + idx2 = Index([1, 2, 3.0001]) + kwargs = dict(check_exact=check_exact, + check_less_precise=check_less_precise) + + if check_exact or not check_less_precise: + msg = """Index are different + +Index values are different \\(33\\.33333 %\\) +\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\) +\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0001\\], dtype='float64'\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, **kwargs) + else: + assert_index_equal(idx1, idx2, **kwargs) + + +def test_index_equal_values_too_far(check_exact, check_less_precise): + idx1 = Index([1, 2, 3]) + idx2 = Index([1, 2, 4]) + kwargs = dict(check_exact=check_exact, + check_less_precise=check_less_precise) + + msg = """Index are different + 
+Index values are different \\(33\\.33333 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 4\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, **kwargs) + + +def test_index_equal_level_values_mismatch(check_exact, check_less_precise): + idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), + ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), + ("B", 3), ("B", 4)]) + kwargs = dict(check_exact=check_exact, + check_less_precise=check_less_precise) + + msg = """MultiIndex level \\[1\\] are different + +MultiIndex level \\[1\\] values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, **kwargs) + + +@pytest.mark.parametrize("name1,name2", [ + (None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT) +]) +def test_index_equal_names(name1, name2): + msg = """Index are different + +Attribute "names" are different +\\[left\\]: \\[{name1}\\] +\\[right\\]: \\[{name2}\\]""" + + idx1 = Index([1, 2, 3], name=name1) + idx2 = Index([1, 2, 3], name=name2) + + if name1 == name2 or name1 is name2: + assert_index_equal(idx1, idx2) + else: + name1 = "u?'x'" if name1 == "x" else name1 + name2 = "u?'x'" if name2 == "x" else name2 + msg = msg.format(name1=name1, name2=name2) + + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2) + + +def test_index_equal_category_mismatch(check_categorical): + msg = """Index are different + +Attribute "dtype" are different +\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\) +\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \ +ordered=False\\)""" + + idx1 = Index(Categorical(["a", "b"])) + idx2 = Index(Categorical(["a", "b"], categories=["a", "b", "c"])) + + 
if check_categorical: + with pytest.raises(AssertionError, match=msg): + assert_index_equal(idx1, idx2, check_categorical=check_categorical) + else: + assert_index_equal(idx1, idx2, check_categorical=check_categorical) diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py new file mode 100644 index 0000000000000..c81a27f9b3f19 --- /dev/null +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +import pytest + +from pandas import interval_range +from pandas.util.testing import assert_interval_array_equal + + +@pytest.mark.parametrize("kwargs", [ + dict(start=0, periods=4), + dict(start=1, periods=5), + dict(start=5, end=10, closed="left"), +]) +def test_interval_array_equal(kwargs): + arr = interval_range(**kwargs).values + assert_interval_array_equal(arr, arr) + + +def test_interval_array_equal_closed_mismatch(): + kwargs = dict(start=0, periods=5) + arr1 = interval_range(closed="left", **kwargs).values + arr2 = interval_range(closed="right", **kwargs).values + + msg = """\ +IntervalArray are different + +Attribute "closed" are different +\\[left\\]: left +\\[right\\]: right""" + + with pytest.raises(AssertionError, match=msg): + assert_interval_array_equal(arr1, arr2) + + +def test_interval_array_equal_periods_mismatch(): + kwargs = dict(start=0) + arr1 = interval_range(periods=5, **kwargs).values + arr2 = interval_range(periods=6, **kwargs).values + + msg = """\ +IntervalArray.left are different + +IntervalArray.left length are different +\\[left\\]: 5, Int64Index\\(\\[0, 1, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: 6, Int64Index\\(\\[0, 1, 2, 3, 4, 5\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_interval_array_equal(arr1, arr2) + + +def test_interval_array_equal_end_mismatch(): + kwargs = dict(start=0, periods=5) + arr1 = interval_range(end=10, **kwargs).values + arr2 = interval_range(end=20, 
**kwargs).values + + msg = """\ +IntervalArray.left are different + +IntervalArray.left values are different \\(80.0 %\\) +\\[left\\]: Int64Index\\(\\[0, 2, 4, 6, 8\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[0, 4, 8, 12, 16\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_interval_array_equal(arr1, arr2) + + +def test_interval_array_equal_start_mismatch(): + kwargs = dict(periods=4) + arr1 = interval_range(start=0, **kwargs).values + arr2 = interval_range(start=1, **kwargs).values + + msg = """\ +IntervalArray.left are different + +IntervalArray.left values are different \\(100.0 %\\) +\\[left\\]: Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_interval_array_equal(arr1, arr2) diff --git a/pandas/tests/util/test_assert_numpy_array_equal.py b/pandas/tests/util/test_assert_numpy_array_equal.py new file mode 100644 index 0000000000000..99037fcf96194 --- /dev/null +++ b/pandas/tests/util/test_assert_numpy_array_equal.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pytest + +from pandas import Timestamp +from pandas.util.testing import assert_numpy_array_equal + + +def test_assert_numpy_array_equal_shape_mismatch(): + msg = """numpy array are different + +numpy array shapes are different +\\[left\\]: \\(2L*,\\) +\\[right\\]: \\(3L*,\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5])) + + +def test_assert_numpy_array_equal_bad_type(): + expected = "Expected type" + + with pytest.raises(AssertionError, match=expected): + assert_numpy_array_equal(1, 2) + + +@pytest.mark.parametrize("a,b,klass1,klass2", [ + (np.array([1]), 1, "ndarray", "int"), + (1, np.array([1]), "int", "ndarray"), +]) +def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2): + msg = """numpy array are different + +numpy array 
classes are different +\\[left\\]: {klass1} +\\[right\\]: {klass2}""".format(klass1=klass1, klass2=klass2) + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(a, b) + + +def test_assert_numpy_array_equal_value_mismatch1(): + msg = """numpy array are different + +numpy array values are different \\(66\\.66667 %\\) +\\[left\\]: \\[nan, 2\\.0, 3\\.0\\] +\\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([np.nan, 2, 3]), + np.array([1, np.nan, 3])) + + +def test_assert_numpy_array_equal_value_mismatch2(): + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[1, 2\\] +\\[right\\]: \\[1, 3\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([1, 2]), np.array([1, 3])) + + +def test_assert_numpy_array_equal_value_mismatch3(): + msg = """numpy array are different + +numpy array values are different \\(16\\.66667 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]), + np.array([[1, 3], [3, 4], [5, 6]])) + + +def test_assert_numpy_array_equal_value_mismatch4(): + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[1\\.1, 2\\.000001\\] +\\[right\\]: \\[1\\.1, 2.0\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([1.1, 2.000001]), + np.array([1.1, 2.0])) + + +def test_assert_numpy_array_equal_value_mismatch5(): + msg = """numpy array are different + +numpy array values are different \\(16\\.66667 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]), 
+ np.array([[1, 3], [3, 4], [5, 6]])) + + +def test_assert_numpy_array_equal_value_mismatch6(): + msg = """numpy array are different + +numpy array values are different \\(25\\.0 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([[1, 2], [3, 4]]), + np.array([[1, 3], [3, 4]])) + + +def test_assert_numpy_array_equal_shape_mismatch_override(): + msg = """Index are different + +Index shapes are different +\\[left\\]: \\(2L*,\\) +\\[right\\]: \\(3L*,\\)""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([1, 2]), + np.array([3, 4, 5]), + obj="Index") + + +def test_numpy_array_equal_unicode(): + # see gh-20503 + # + # Test ensures that `assert_numpy_array_equals` raises the right + # exception when comparing np.arrays containing differing unicode objects. + msg = """numpy array are different + +numpy array values are different \\(33\\.33333 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[á, à, å\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(np.array([u"á", u"à", u"ä"]), + np.array([u"á", u"à", u"å"])) + + +def test_numpy_array_equal_object(): + a = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-01")]) + b = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]) + + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\] +\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]""" + + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(a, b) + + +@pytest.mark.parametrize("other_type", ["same", "copy"]) +@pytest.mark.parametrize("check_same", ["same", "copy"]) +def test_numpy_array_equal_copy_flag(other_type, check_same): + a = np.array([1, 2, 3]) + msg = None + + if other_type == "same": + other = a.view() + else: + other = a.copy() + + if 
check_same != other_type: + msg = (r"array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)" + if check_same == "same" + else r"array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)") + + if msg is not None: + with pytest.raises(AssertionError, match=msg): + assert_numpy_array_equal(a, other, check_same=check_same) + else: + assert_numpy_array_equal(a, other, check_same=check_same) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py new file mode 100644 index 0000000000000..537a0e01ff85f --- /dev/null +++ b/pandas/tests/util/test_assert_series_equal.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +import pytest + +from pandas import Categorical, DataFrame, Series +from pandas.util.testing import assert_series_equal + + +def _assert_series_equal_both(a, b, **kwargs): + """ + Check that two Series equal. + + This check is performed commutatively. + + Parameters + ---------- + a : Series + The first Series to compare. + b : Series + The second Series to compare. + kwargs : dict + The arguments passed to `assert_series_equal`. + """ + assert_series_equal(a, b, **kwargs) + assert_series_equal(b, a, **kwargs) + + +def _assert_not_series_equal(a, b, **kwargs): + """ + Check that two Series are not equal. + + Parameters + ---------- + a : Series + The first Series to compare. + b : Series + The second Series to compare. + kwargs : dict + The arguments passed to `assert_series_equal`. + """ + try: + assert_series_equal(a, b, **kwargs) + msg = "The two Series were equal when they shouldn't have been" + + pytest.fail(msg=msg) + except AssertionError: + pass + + +def _assert_not_series_equal_both(a, b, **kwargs): + """ + Check that two Series are not equal. + + This check is performed commutatively. + + Parameters + ---------- + a : Series + The first Series to compare. + b : Series + The second Series to compare. + kwargs : dict + The arguments passed to `assert_series_equal`. 
+ """ + _assert_not_series_equal(a, b, **kwargs) + _assert_not_series_equal(b, a, **kwargs) + + +@pytest.mark.parametrize("data", [ + range(3), list("abc"), list(u"áàä"), +]) +def test_series_equal(data): + _assert_series_equal_both(Series(data), Series(data)) + + +@pytest.mark.parametrize("data1,data2", [ + (range(3), range(1, 4)), + (list("abc"), list("xyz")), + (list(u"áàä"), list(u"éèë")), + (list(u"áàä"), list(b"aaa")), + (range(3), range(4)), +]) +def test_series_not_equal_value_mismatch(data1, data2): + _assert_not_series_equal_both(Series(data1), Series(data2)) + + +@pytest.mark.parametrize("kwargs", [ + dict(dtype="float64"), # dtype mismatch + dict(index=[1, 2, 4]), # index mismatch + dict(name="foo"), # name mismatch +]) +def test_series_not_equal_metadata_mismatch(kwargs): + data = range(3) + s1 = Series(data) + + s2 = Series(data, **kwargs) + _assert_not_series_equal_both(s1, s2) + + +@pytest.mark.parametrize("data1,data2", [(0.12345, 0.12346), (0.1235, 0.1236)]) +@pytest.mark.parametrize("dtype", ["float32", "float64"]) +@pytest.mark.parametrize("check_less_precise", [False, True, 0, 1, 2, 3, 10]) +def test_less_precise(data1, data2, dtype, check_less_precise): + s1 = Series([data1], dtype=dtype) + s2 = Series([data2], dtype=dtype) + + kwargs = dict(check_less_precise=check_less_precise) + + if ((check_less_precise is False or check_less_precise == 10) or + ((check_less_precise is True or check_less_precise >= 3) and + abs(data1 - data2) >= 0.0001)): + msg = "Series values are different" + with pytest.raises(AssertionError, match=msg): + assert_series_equal(s1, s2, **kwargs) + else: + _assert_series_equal_both(s1, s2, **kwargs) + + +@pytest.mark.parametrize("s1,s2,msg", [ + # Index + (Series(["l1", "l2"], index=[1, 2]), + Series(["l1", "l2"], index=[1., 2.]), + "Series\\.index are different"), + + # MultiIndex + (DataFrame.from_records({"a": [1, 2], "b": [2.1, 1.5], + "c": ["l1", "l2"]}, index=["a", "b"]).c, + DataFrame.from_records({"a": [1., 2.], 
"b": [2.1, 1.5], + "c": ["l1", "l2"]}, index=["a", "b"]).c, + "MultiIndex level \\[0\\] are different") +]) +def test_series_equal_index_dtype(s1, s2, msg, check_index_type): + kwargs = dict(check_index_type=check_index_type) + + if check_index_type: + with pytest.raises(AssertionError, match=msg): + assert_series_equal(s1, s2, **kwargs) + else: + assert_series_equal(s1, s2, **kwargs) + + +def test_series_equal_length_mismatch(check_less_precise): + msg = """Series are different + +Series length are different +\\[left\\]: 3, RangeIndex\\(start=0, stop=3, step=1\\) +\\[right\\]: 4, RangeIndex\\(start=0, stop=4, step=1\\)""" + + s1 = Series([1, 2, 3]) + s2 = Series([1, 2, 3, 4]) + + with pytest.raises(AssertionError, match=msg): + assert_series_equal(s1, s2, check_less_precise=check_less_precise) + + +def test_series_equal_values_mismatch(check_less_precise): + msg = """Series are different + +Series values are different \\(33\\.33333 %\\) +\\[left\\]: \\[1, 2, 3\\] +\\[right\\]: \\[1, 2, 4\\]""" + + s1 = Series([1, 2, 3]) + s2 = Series([1, 2, 4]) + + with pytest.raises(AssertionError, match=msg): + assert_series_equal(s1, s2, check_less_precise=check_less_precise) + + +def test_series_equal_categorical_mismatch(check_categorical): + msg = """Attributes are different + +Attribute "dtype" are different +\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\) +\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \ +ordered=False\\)""" + + s1 = Series(Categorical(["a", "b"])) + s2 = Series(Categorical(["a", "b"], categories=list("abc"))) + + if check_categorical: + with pytest.raises(AssertionError, match=msg): + assert_series_equal(s1, s2, check_categorical=check_categorical) + else: + _assert_series_equal_both(s1, s2, check_categorical=check_categorical) diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py deleted file mode 100644 index e649cea14ec39..0000000000000 --- 
a/pandas/tests/util/test_testing.py +++ /dev/null @@ -1,984 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import sys -import textwrap - -import numpy as np -import pytest - -from pandas.compat import raise_with_traceback -import pandas.util._test_decorators as td - -import pandas as pd -from pandas import DataFrame, Series, compat -from pandas.core.arrays.sparse import SparseArray -import pandas.util.testing as tm -from pandas.util.testing import ( - RNGContext, assert_almost_equal, assert_extension_array_equal, - assert_frame_equal, assert_index_equal, assert_numpy_array_equal, - assert_series_equal) - - -class TestAssertAlmostEqual(object): - - def _assert_almost_equal_both(self, a, b, **kwargs): - assert_almost_equal(a, b, **kwargs) - assert_almost_equal(b, a, **kwargs) - - def _assert_not_almost_equal_both(self, a, b, **kwargs): - pytest.raises(AssertionError, assert_almost_equal, a, b, **kwargs) - pytest.raises(AssertionError, assert_almost_equal, b, a, **kwargs) - - def test_assert_almost_equal_numbers(self): - self._assert_almost_equal_both(1.1, 1.1) - self._assert_almost_equal_both(1.1, 1.100001) - self._assert_almost_equal_both(np.int16(1), 1.000001) - self._assert_almost_equal_both(np.float64(1.1), 1.1) - self._assert_almost_equal_both(np.uint32(5), 5) - - self._assert_not_almost_equal_both(1.1, 1) - self._assert_not_almost_equal_both(1.1, True) - self._assert_not_almost_equal_both(1, 2) - self._assert_not_almost_equal_both(1.0001, np.int16(1)) - - def test_assert_almost_equal_numbers_with_zeros(self): - self._assert_almost_equal_both(0, 0) - self._assert_almost_equal_both(0, 0.0) - self._assert_almost_equal_both(0, np.float64(0)) - self._assert_almost_equal_both(0.000001, 0) - - self._assert_not_almost_equal_both(0.001, 0) - self._assert_not_almost_equal_both(1, 0) - - def test_assert_almost_equal_numbers_with_mixed(self): - self._assert_not_almost_equal_both(1, 'abc') - self._assert_not_almost_equal_both(1, [1, ]) - self._assert_not_almost_equal_both(1, 
object()) - - @pytest.mark.parametrize( - "left_dtype", - ['M8[ns]', 'm8[ns]', 'float64', 'int64', 'object']) - @pytest.mark.parametrize( - "right_dtype", - ['M8[ns]', 'm8[ns]', 'float64', 'int64', 'object']) - def test_assert_almost_equal_edge_case_ndarrays( - self, left_dtype, right_dtype): - - # empty compare - self._assert_almost_equal_both(np.array([], dtype=left_dtype), - np.array([], dtype=right_dtype), - check_dtype=False) - - def test_assert_almost_equal_dicts(self): - self._assert_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 2}) - - self._assert_not_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 3}) - self._assert_not_almost_equal_both({'a': 1, 'b': 2}, - {'a': 1, 'b': 2, 'c': 3}) - self._assert_not_almost_equal_both({'a': 1}, 1) - self._assert_not_almost_equal_both({'a': 1}, 'abc') - self._assert_not_almost_equal_both({'a': 1}, [1, ]) - - def test_assert_almost_equal_dict_like_object(self): - class DictLikeObj(object): - - def keys(self): - return ('a', ) - - def __getitem__(self, item): - if item == 'a': - return 1 - - self._assert_almost_equal_both({'a': 1}, DictLikeObj(), - check_dtype=False) - - self._assert_not_almost_equal_both({'a': 2}, DictLikeObj(), - check_dtype=False) - - def test_assert_almost_equal_strings(self): - self._assert_almost_equal_both('abc', 'abc') - - self._assert_not_almost_equal_both('abc', 'abcd') - self._assert_not_almost_equal_both('abc', 'abd') - self._assert_not_almost_equal_both('abc', 1) - self._assert_not_almost_equal_both('abc', [1, ]) - - def test_assert_almost_equal_iterables(self): - self._assert_almost_equal_both([1, 2, 3], [1, 2, 3]) - self._assert_almost_equal_both(np.array([1, 2, 3]), - np.array([1, 2, 3])) - - # class / dtype are different - self._assert_not_almost_equal_both(np.array([1, 2, 3]), [1, 2, 3]) - self._assert_not_almost_equal_both(np.array([1, 2, 3]), - np.array([1., 2., 3.])) - - # Can't compare generators - self._assert_not_almost_equal_both(iter([1, 2, 3]), [1, 2, 3]) - - 
self._assert_not_almost_equal_both([1, 2, 3], [1, 2, 4]) - self._assert_not_almost_equal_both([1, 2, 3], [1, 2, 3, 4]) - self._assert_not_almost_equal_both([1, 2, 3], 1) - - def test_assert_almost_equal_null(self): - self._assert_almost_equal_both(None, None) - - self._assert_not_almost_equal_both(None, np.NaN) - self._assert_not_almost_equal_both(None, 0) - self._assert_not_almost_equal_both(np.NaN, 0) - - def test_assert_almost_equal_inf(self): - self._assert_almost_equal_both(np.inf, np.inf) - self._assert_almost_equal_both(np.inf, float("inf")) - self._assert_not_almost_equal_both(np.inf, 0) - self._assert_almost_equal_both(np.array([np.inf, np.nan, -np.inf]), - np.array([np.inf, np.nan, -np.inf])) - self._assert_almost_equal_both(np.array([np.inf, None, -np.inf], - dtype=np.object_), - np.array([np.inf, np.nan, -np.inf], - dtype=np.object_)) - - def test_assert_almost_equal_pandas(self): - tm.assert_almost_equal(pd.Index([1., 1.1]), - pd.Index([1., 1.100001])) - tm.assert_almost_equal(pd.Series([1., 1.1]), - pd.Series([1., 1.100001])) - tm.assert_almost_equal(pd.DataFrame({'a': [1., 1.1]}), - pd.DataFrame({'a': [1., 1.100001]})) - - def test_assert_almost_equal_object(self): - a = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-01')] - b = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-01')] - self._assert_almost_equal_both(a, b) - - -class TestUtilTesting(object): - - def test_raise_with_traceback(self): - with pytest.raises(LookupError, match="error_text"): - try: - raise ValueError("THIS IS AN ERROR") - except ValueError as e: - e = LookupError("error_text") - raise_with_traceback(e) - with pytest.raises(LookupError, match="error_text"): - try: - raise ValueError("This is another error") - except ValueError: - e = LookupError("error_text") - _, _, traceback = sys.exc_info() - raise_with_traceback(e, traceback) - - def test_convert_rows_list_to_csv_str(self): - rows_list = ["aaa", "bbb", "ccc"] - ret = tm.convert_rows_list_to_csv_str(rows_list) - - if 
compat.is_platform_windows(): - expected = "aaa\r\nbbb\r\nccc\r\n" - else: - expected = "aaa\nbbb\nccc\n" - - assert ret == expected - - -class TestAssertNumpyArrayEqual(object): - - @td.skip_if_windows - def test_numpy_array_equal_message(self): - - expected = """numpy array are different - -numpy array shapes are different -\\[left\\]: \\(2,\\) -\\[right\\]: \\(3,\\)""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5])) - - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5])) - - # scalar comparison - expected = """Expected type """ - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(1, 2) - expected = """expected 2\\.00000 but got 1\\.00000, with decimal 5""" - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(1, 2) - - # array / scalar array comparison - expected = """numpy array are different - -numpy array classes are different -\\[left\\]: ndarray -\\[right\\]: int""" - - with pytest.raises(AssertionError, match=expected): - # numpy_array_equal only accepts np.ndarray - assert_numpy_array_equal(np.array([1]), 1) - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(np.array([1]), 1) - - # scalar / array comparison - expected = """numpy array are different - -numpy array classes are different -\\[left\\]: int -\\[right\\]: ndarray""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(1, np.array([1])) - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(1, np.array([1])) - - expected = """numpy array are different - -numpy array values are different \\(66\\.66667 %\\) -\\[left\\]: \\[nan, 2\\.0, 3\\.0\\] -\\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(np.array([np.nan, 2, 3]), - np.array([1, np.nan, 3])) - with 
pytest.raises(AssertionError, match=expected): - assert_almost_equal(np.array([np.nan, 2, 3]), - np.array([1, np.nan, 3])) - - expected = """numpy array are different - -numpy array values are different \\(50\\.0 %\\) -\\[left\\]: \\[1, 2\\] -\\[right\\]: \\[1, 3\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(np.array([1, 2]), np.array([1, 3])) - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(np.array([1, 2]), np.array([1, 3])) - - expected = """numpy array are different - -numpy array values are different \\(50\\.0 %\\) -\\[left\\]: \\[1\\.1, 2\\.000001\\] -\\[right\\]: \\[1\\.1, 2.0\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal( - np.array([1.1, 2.000001]), np.array([1.1, 2.0])) - - # must pass - assert_almost_equal(np.array([1.1, 2.000001]), np.array([1.1, 2.0])) - - expected = """numpy array are different - -numpy array values are different \\(16\\.66667 %\\) -\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\] -\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]), - np.array([[1, 3], [3, 4], [5, 6]])) - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(np.array([[1, 2], [3, 4], [5, 6]]), - np.array([[1, 3], [3, 4], [5, 6]])) - - expected = """numpy array are different - -numpy array values are different \\(25\\.0 %\\) -\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\]\\] -\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(np.array([[1, 2], [3, 4]]), - np.array([[1, 3], [3, 4]])) - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(np.array([[1, 2], [3, 4]]), - np.array([[1, 3], [3, 4]])) - - # allow to overwrite message - expected = """Index are different - -Index shapes are different -\\[left\\]: \\(2,\\) -\\[right\\]: 
\\(3,\\)""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]), - obj='Index') - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]), - obj='Index') - - def test_numpy_array_equal_unicode_message(self): - # Test ensures that `assert_numpy_array_equals` raises the right - # exception when comparing np.arrays containing differing - # unicode objects (#20503) - - expected = """numpy array are different - -numpy array values are different \\(33\\.33333 %\\) -\\[left\\]: \\[á, à, ä\\] -\\[right\\]: \\[á, à, å\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(np.array([u'á', u'à', u'ä']), - np.array([u'á', u'à', u'å'])) - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(np.array([u'á', u'à', u'ä']), - np.array([u'á', u'à', u'å'])) - - @td.skip_if_windows - def test_numpy_array_equal_object_message(self): - - a = np.array([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-01')]) - b = np.array([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]) - - expected = """numpy array are different - -numpy array values are different \\(50\\.0 %\\) -\\[left\\]: \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\] -\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(a, b) - with pytest.raises(AssertionError, match=expected): - assert_almost_equal(a, b) - - def test_numpy_array_equal_copy_flag(self): - a = np.array([1, 2, 3]) - b = a.copy() - c = a.view() - expected = r'array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)' - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(a, b, check_same='same') - expected = r'array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)' - with pytest.raises(AssertionError, match=expected): - assert_numpy_array_equal(a, c, check_same='copy') - - def 
test_assert_almost_equal_iterable_message(self): - - expected = """Iterable are different - -Iterable length are different -\\[left\\]: 2 -\\[right\\]: 3""" - - with pytest.raises(AssertionError, match=expected): - assert_almost_equal([1, 2], [3, 4, 5]) - - expected = """Iterable are different - -Iterable values are different \\(50\\.0 %\\) -\\[left\\]: \\[1, 2\\] -\\[right\\]: \\[1, 3\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_almost_equal([1, 2], [1, 3]) - - -class TestAssertIndexEqual(object): - - def test_index_equal_message(self): - - expected = """Index are different - -Index levels are different -\\[left\\]: 1, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) -\\[right\\]: 2, MultiIndex\\(levels=\\[\\[u?'A', u?'B'\\], \\[1, 2, 3, 4\\]\\], - labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)""" - - idx1 = pd.Index([1, 2, 3]) - idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), - ('B', 3), ('B', 4)]) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2, exact=False) - - expected = """MultiIndex level \\[1\\] are different - -MultiIndex level \\[1\\] values are different \\(25\\.0 %\\) -\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) -\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" - - idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), - ('B', 3), ('B', 4)]) - idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), - ('B', 3), ('B', 4)]) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2, check_exact=False) - - expected = """Index are different - -Index length are different -\\[left\\]: 3, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) -\\[right\\]: 4, Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" - - idx1 = pd.Index([1, 2, 3]) - idx2 = pd.Index([1, 2, 3, 4]) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2) - with 
pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2, check_exact=False) - - expected = """Index are different - -Index classes are different -\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) -\\[right\\]: Float64Index\\(\\[1\\.0, 2\\.0, 3\\.0\\], dtype='float64'\\)""" - - idx1 = pd.Index([1, 2, 3]) - idx2 = pd.Index([1, 2, 3.0]) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2, exact=True) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2, exact=True, check_exact=False) - - expected = """Index are different - -Index values are different \\(33\\.33333 %\\) -\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\) -\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0000000001\\], dtype='float64'\\)""" - - idx1 = pd.Index([1, 2, 3.]) - idx2 = pd.Index([1, 2, 3.0000000001]) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2) - - # must success - assert_index_equal(idx1, idx2, check_exact=False) - - expected = """Index are different - -Index values are different \\(33\\.33333 %\\) -\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\) -\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0001\\], dtype='float64'\\)""" - - idx1 = pd.Index([1, 2, 3.]) - idx2 = pd.Index([1, 2, 3.0001]) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2, check_exact=False) - # must success - assert_index_equal(idx1, idx2, check_exact=False, - check_less_precise=True) - - expected = """Index are different - -Index values are different \\(33\\.33333 %\\) -\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) -\\[right\\]: Int64Index\\(\\[1, 2, 4\\], dtype='int64'\\)""" - - idx1 = pd.Index([1, 2, 3]) - idx2 = pd.Index([1, 2, 4]) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2) - with 
pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2, check_less_precise=True) - - expected = """MultiIndex level \\[1\\] are different - -MultiIndex level \\[1\\] values are different \\(25\\.0 %\\) -\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) -\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" - - idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), - ('B', 3), ('B', 4)]) - idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), - ('B', 3), ('B', 4)]) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2, check_exact=False) - - def test_index_equal_metadata_message(self): - - expected = """Index are different - -Attribute "names" are different -\\[left\\]: \\[None\\] -\\[right\\]: \\[u?'x'\\]""" - - idx1 = pd.Index([1, 2, 3]) - idx2 = pd.Index([1, 2, 3], name='x') - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2) - - # same name, should pass - assert_index_equal(pd.Index([1, 2, 3], name=np.nan), - pd.Index([1, 2, 3], name=np.nan)) - assert_index_equal(pd.Index([1, 2, 3], name=pd.NaT), - pd.Index([1, 2, 3], name=pd.NaT)) - - expected = """Index are different - -Attribute "names" are different -\\[left\\]: \\[nan\\] -\\[right\\]: \\[NaT\\]""" - - idx1 = pd.Index([1, 2, 3], name=np.nan) - idx2 = pd.Index([1, 2, 3], name=pd.NaT) - with pytest.raises(AssertionError, match=expected): - assert_index_equal(idx1, idx2) - - def test_categorical_index_equality(self): - expected = """Index are different - -Attribute "dtype" are different -\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\) -\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \ -ordered=False\\)""" - - with pytest.raises(AssertionError, match=expected): - assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])), - pd.Index(pd.Categorical(['a', 'b'], - 
categories=['a', 'b', 'c']))) - - def test_categorical_index_equality_relax_categories_check(self): - assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])), - pd.Index(pd.Categorical(['a', 'b'], - categories=['a', 'b', 'c'])), - check_categorical=False) - - -class TestAssertSeriesEqual(object): - - def _assert_equal(self, x, y, **kwargs): - assert_series_equal(x, y, **kwargs) - assert_series_equal(y, x, **kwargs) - - def _assert_not_equal(self, a, b, **kwargs): - pytest.raises(AssertionError, assert_series_equal, a, b, **kwargs) - pytest.raises(AssertionError, assert_series_equal, b, a, **kwargs) - - def test_equal(self): - self._assert_equal(Series(range(3)), Series(range(3))) - self._assert_equal(Series(list('abc')), Series(list('abc'))) - self._assert_equal(Series(list(u'áàä')), Series(list(u'áàä'))) - - def test_not_equal(self): - self._assert_not_equal(Series(range(3)), Series(range(3)) + 1) - self._assert_not_equal(Series(list('abc')), Series(list('xyz'))) - self._assert_not_equal(Series(list(u'áàä')), Series(list(u'éèë'))) - self._assert_not_equal(Series(list(u'áàä')), Series(list(b'aaa'))) - self._assert_not_equal(Series(range(3)), Series(range(4))) - self._assert_not_equal( - Series(range(3)), Series( - range(3), dtype='float64')) - self._assert_not_equal( - Series(range(3)), Series( - range(3), index=[1, 2, 4])) - - # ATM meta data is not checked in assert_series_equal - # self._assert_not_equal(Series(range(3)),Series(range(3),name='foo'),check_names=True) - - def test_less_precise(self): - s1 = Series([0.12345], dtype='float64') - s2 = Series([0.12346], dtype='float64') - - pytest.raises(AssertionError, assert_series_equal, s1, s2) - self._assert_equal(s1, s2, check_less_precise=True) - for i in range(4): - self._assert_equal(s1, s2, check_less_precise=i) - pytest.raises(AssertionError, assert_series_equal, s1, s2, 10) - - s1 = Series([0.12345], dtype='float32') - s2 = Series([0.12346], dtype='float32') - - pytest.raises(AssertionError, 
assert_series_equal, s1, s2) - self._assert_equal(s1, s2, check_less_precise=True) - for i in range(4): - self._assert_equal(s1, s2, check_less_precise=i) - pytest.raises(AssertionError, assert_series_equal, s1, s2, 10) - - # even less than less precise - s1 = Series([0.1235], dtype='float32') - s2 = Series([0.1236], dtype='float32') - - pytest.raises(AssertionError, assert_series_equal, s1, s2) - pytest.raises(AssertionError, assert_series_equal, s1, s2, True) - - def test_index_dtype(self): - df1 = DataFrame.from_records( - {'a': [1, 2], 'c': ['l1', 'l2']}, index=['a']) - df2 = DataFrame.from_records( - {'a': [1.0, 2.0], 'c': ['l1', 'l2']}, index=['a']) - self._assert_not_equal(df1.c, df2.c, check_index_type=True) - - def test_multiindex_dtype(self): - df1 = DataFrame.from_records( - {'a': [1, 2], 'b': [2.1, 1.5], - 'c': ['l1', 'l2']}, index=['a', 'b']) - df2 = DataFrame.from_records( - {'a': [1.0, 2.0], 'b': [2.1, 1.5], - 'c': ['l1', 'l2']}, index=['a', 'b']) - self._assert_not_equal(df1.c, df2.c, check_index_type=True) - - def test_series_equal_message(self): - - expected = """Series are different - -Series length are different -\\[left\\]: 3, RangeIndex\\(start=0, stop=3, step=1\\) -\\[right\\]: 4, RangeIndex\\(start=0, stop=4, step=1\\)""" - - with pytest.raises(AssertionError, match=expected): - assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 3, 4])) - - expected = """Series are different - -Series values are different \\(33\\.33333 %\\) -\\[left\\]: \\[1, 2, 3\\] -\\[right\\]: \\[1, 2, 4\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 4])) - with pytest.raises(AssertionError, match=expected): - assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 4]), - check_less_precise=True) - - def test_categorical_series_equality(self): - expected = """Attributes are different - -Attribute "dtype" are different -\\[left\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], 
ordered=False\\) -\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \ -ordered=False\\)""" - - with pytest.raises(AssertionError, match=expected): - assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])), - pd.Series(pd.Categorical(['a', 'b'], - categories=['a', 'b', 'c']))) - - def test_categorical_series_equality_relax_categories_check(self): - assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])), - pd.Series(pd.Categorical(['a', 'b'], - categories=['a', 'b', 'c'])), - check_categorical=False) - - -class TestAssertFrameEqual(object): - - def _assert_equal(self, x, y, **kwargs): - assert_frame_equal(x, y, **kwargs) - assert_frame_equal(y, x, **kwargs) - - def _assert_not_equal(self, a, b, **kwargs): - pytest.raises(AssertionError, assert_frame_equal, a, b, **kwargs) - pytest.raises(AssertionError, assert_frame_equal, b, a, **kwargs) - - def test_equal_with_different_row_order(self): - # check_like=True ignores row-column orderings - df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - index=['a', 'b', 'c']) - df2 = pd.DataFrame({'A': [3, 2, 1], 'B': [6, 5, 4]}, - index=['c', 'b', 'a']) - - self._assert_equal(df1, df2, check_like=True) - self._assert_not_equal(df1, df2) - - def test_not_equal_with_different_shape(self): - self._assert_not_equal(pd.DataFrame({'A': [1, 2, 3]}), - pd.DataFrame({'A': [1, 2, 3, 4]})) - - def test_index_dtype(self): - df1 = DataFrame.from_records( - {'a': [1, 2], 'c': ['l1', 'l2']}, index=['a']) - df2 = DataFrame.from_records( - {'a': [1.0, 2.0], 'c': ['l1', 'l2']}, index=['a']) - self._assert_not_equal(df1, df2, check_index_type=True) - - def test_multiindex_dtype(self): - df1 = DataFrame.from_records( - {'a': [1, 2], 'b': [2.1, 1.5], - 'c': ['l1', 'l2']}, index=['a', 'b']) - df2 = DataFrame.from_records( - {'a': [1.0, 2.0], 'b': [2.1, 1.5], - 'c': ['l1', 'l2']}, index=['a', 'b']) - self._assert_not_equal(df1, df2, check_index_type=True) - - def test_empty_dtypes(self): - df1 = pd.DataFrame(columns=["col1", 
"col2"]) - df1["col1"] = df1["col1"].astype('int64') - df2 = pd.DataFrame(columns=["col1", "col2"]) - self._assert_equal(df1, df2, check_dtype=False) - self._assert_not_equal(df1, df2, check_dtype=True) - - def test_frame_equal_message(self): - - expected = """DataFrame are different - -DataFrame shape mismatch -\\[left\\]: \\(3, 2\\) -\\[right\\]: \\(3, 1\\)""" - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}), - pd.DataFrame({'A': [1, 2, 3]})) - - expected = """DataFrame\\.index are different - -DataFrame\\.index values are different \\(33\\.33333 %\\) -\\[left\\]: Index\\(\\[u?'a', u?'b', u?'c'\\], dtype='object'\\) -\\[right\\]: Index\\(\\[u?'a', u?'b', u?'d'\\], dtype='object'\\)""" - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - index=['a', 'b', 'c']), - pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - index=['a', 'b', 'd'])) - - expected = """DataFrame\\.columns are different - -DataFrame\\.columns values are different \\(50\\.0 %\\) -\\[left\\]: Index\\(\\[u?'A', u?'B'\\], dtype='object'\\) -\\[right\\]: Index\\(\\[u?'A', u?'b'\\], dtype='object'\\)""" - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - index=['a', 'b', 'c']), - pd.DataFrame({'A': [1, 2, 3], 'b': [4, 5, 6]}, - index=['a', 'b', 'c'])) - - expected = """DataFrame\\.iloc\\[:, 1\\] are different - -DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) -\\[left\\]: \\[4, 5, 6\\] -\\[right\\]: \\[4, 5, 7\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}), - pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]})) - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}), - pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]}), - 
by_blocks=True) - - def test_frame_equal_message_unicode(self): - # Test ensures that `assert_frame_equals` raises the right - # exception when comparing DataFrames containing differing - # unicode objects (#20503) - - expected = """DataFrame\\.iloc\\[:, 1\\] are different - -DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) -\\[left\\]: \\[é, è, ë\\] -\\[right\\]: \\[é, è, e̊\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], - 'E': [u'é', u'è', u'ë']}), - pd.DataFrame({'A': [u'á', u'à', u'ä'], - 'E': [u'é', u'è', u'e̊']})) - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], - 'E': [u'é', u'è', u'ë']}), - pd.DataFrame({'A': [u'á', u'à', u'ä'], - 'E': [u'é', u'è', u'e̊']}), - by_blocks=True) - - expected = """DataFrame\\.iloc\\[:, 0\\] are different - -DataFrame\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) -\\[left\\]: \\[á, à, ä\\] -\\[right\\]: \\[a, a, a\\]""" - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], - 'E': [u'é', u'è', u'ë']}), - pd.DataFrame({'A': ['a', 'a', 'a'], - 'E': ['e', 'e', 'e']})) - - with pytest.raises(AssertionError, match=expected): - assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], - 'E': [u'é', u'è', u'ë']}), - pd.DataFrame({'A': ['a', 'a', 'a'], - 'E': ['e', 'e', 'e']}), - by_blocks=True) - - -class TestAssertCategoricalEqual(object): - - def test_categorical_equal_message(self): - - expected = """Categorical\\.categories are different - -Categorical\\.categories values are different \\(25\\.0 %\\) -\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\) -\\[right\\]: Int64Index\\(\\[1, 2, 3, 5\\], dtype='int64'\\)""" - - a = pd.Categorical([1, 2, 3, 4]) - b = pd.Categorical([1, 2, 3, 5]) - with pytest.raises(AssertionError, match=expected): - tm.assert_categorical_equal(a, b) - - expected = 
"""Categorical\\.codes are different - -Categorical\\.codes values are different \\(50\\.0 %\\) -\\[left\\]: \\[0, 1, 3, 2\\] -\\[right\\]: \\[0, 1, 2, 3\\]""" - - a = pd.Categorical([1, 2, 4, 3], categories=[1, 2, 3, 4]) - b = pd.Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) - with pytest.raises(AssertionError, match=expected): - tm.assert_categorical_equal(a, b) - - expected = """Categorical are different - -Attribute "ordered" are different -\\[left\\]: False -\\[right\\]: True""" - - a = pd.Categorical([1, 2, 3, 4], ordered=False) - b = pd.Categorical([1, 2, 3, 4], ordered=True) - with pytest.raises(AssertionError, match=expected): - tm.assert_categorical_equal(a, b) - - -class TestAssertIntervalArrayEqual(object): - def test_interval_array_equal_message(self): - a = pd.interval_range(0, periods=4).values - b = pd.interval_range(1, periods=4).values - - msg = textwrap.dedent("""\ - IntervalArray.left are different - - IntervalArray.left values are different \\(100.0 %\\) - \\[left\\]: Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\) - \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""") - with pytest.raises(AssertionError, match=msg): - tm.assert_interval_array_equal(a, b) - - -class TestAssertExtensionArrayEqual(object): - - def test_check_exact(self): - # GH 23709 - left = SparseArray([-0.17387645482451206, 0.3414148016424936]) - right = SparseArray([-0.17387645482451206, 0.3414148016424937]) - - # passes with check_exact=False (should be default) - assert_extension_array_equal(left, right) - assert_extension_array_equal(left, right, check_exact=False) - - # raises with check_exact=True - msg = textwrap.dedent("""\ - ExtensionArray are different - - ExtensionArray values are different \\(50\\.0 %\\) - \\[left\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\] - \\[right\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\]""") - with pytest.raises(AssertionError, match=msg): - assert_extension_array_equal(left, right, check_exact=True) - - 
@pytest.mark.parametrize('check_less_precise', [True, 0, 1, 2, 3, 4]) - def test_check_less_precise_passes(self, check_less_precise): - left = SparseArray([0.5, 0.123456]) - right = SparseArray([0.5, 0.123457]) - assert_extension_array_equal( - left, right, check_less_precise=check_less_precise) - - @pytest.mark.parametrize('check_less_precise', [False, 5, 6, 7, 8, 9]) - def test_check_less_precise_fails(self, check_less_precise): - left = SparseArray([0.5, 0.123456]) - right = SparseArray([0.5, 0.123457]) - - msg = textwrap.dedent("""\ - ExtensionArray are different - - ExtensionArray values are different \\(50\\.0 %\\) - \\[left\\]: \\[0\\.5, 0\\.123456\\] - \\[right\\]: \\[0\\.5, 0\\.123457\\]""") - with pytest.raises(AssertionError, match=msg): - assert_extension_array_equal( - left, right, check_less_precise=check_less_precise) - - def test_check_dtype(self): - left = SparseArray(np.arange(5, dtype='int64')) - right = SparseArray(np.arange(5, dtype='int32')) - - # passes with check_dtype=False - assert_extension_array_equal(left, right, check_dtype=False) - - # raises with check_dtype=True - msg = textwrap.dedent("""\ - ExtensionArray are different - - Attribute "dtype" are different - \\[left\\]: Sparse\\[int64, 0\\] - \\[right\\]: Sparse\\[int32, 0\\]""") - with pytest.raises(AssertionError, match=msg): - assert_extension_array_equal(left, right, check_dtype=True) - - def test_missing_values(self): - left = SparseArray([np.nan, 1, 2, np.nan]) - right = SparseArray([np.nan, 1, 2, 3]) - - msg = textwrap.dedent("""\ - ExtensionArray NA mask are different - - ExtensionArray NA mask values are different \\(25\\.0 %\\) - \\[left\\]: \\[True, False, False, True\\] - \\[right\\]: \\[True, False, False, False\\]""") - with pytest.raises(AssertionError, match=msg): - assert_extension_array_equal(left, right) - - def test_non_extension_array(self): - numpy_array = np.arange(5) - extension_array = SparseArray(np.arange(5)) - - msg = 'left is not an ExtensionArray' - 
with pytest.raises(AssertionError, match=msg): - assert_extension_array_equal(numpy_array, extension_array) - - msg = 'right is not an ExtensionArray' - with pytest.raises(AssertionError, match=msg): - assert_extension_array_equal(extension_array, numpy_array) - - -class TestRNGContext(object): - - def test_RNGContext(self): - expected0 = 1.764052345967664 - expected1 = 1.6243453636632417 - - with RNGContext(0): - with RNGContext(1): - assert np.random.randn() == expected1 - assert np.random.randn() == expected0 - - -def test_datapath_missing(datapath, request): - if not request.config.getoption("--strict-data-files"): - pytest.skip("Need to set '--strict-data-files'") - - with pytest.raises(ValueError): - datapath('not_a_file') - - result = datapath('data', 'iris.csv') - expected = os.path.join( - os.path.dirname(os.path.dirname(__file__)), - 'data', - 'iris.csv' - ) - - assert result == expected - - -def test_create_temp_directory(): - with tm.ensure_clean_dir() as path: - assert os.path.exists(path) - assert os.path.isdir(path) - assert not os.path.exists(path) - - -def test_assert_raises_regex_deprecated(): - # see gh-23592 - - with tm.assert_produces_warning(FutureWarning): - msg = "Not equal!" 
- - with tm.assert_raises_regex(AssertionError, msg): - assert 1 == 2, msg diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index e4b2f0a75051a..f9282ff15612d 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -1,6 +1,11 @@ # -*- coding: utf-8 -*- +import os +import sys + import pytest +import pandas.compat as compat +from pandas.compat import raise_with_traceback from pandas.util._decorators import deprecate_kwarg, make_signature from pandas.util._validators import validate_kwargs @@ -9,19 +14,19 @@ def test_rands(): r = tm.rands(10) - assert(len(r) == 10) + assert len(r) == 10 def test_rands_array_1d(): arr = tm.rands_array(5, size=10) - assert(arr.shape == (10,)) - assert(len(arr[0]) == 5) + assert arr.shape == (10,) + assert len(arr[0]) == 5 def test_rands_array_2d(): arr = tm.rands_array(7, size=(10, 10)) - assert(arr.shape == (10, 10)) - assert(len(arr[1, 1]) == 7) + assert arr.shape == (10, 10) + assert len(arr[1, 1]) == 7 def test_numpy_err_state_is_default(): @@ -48,3 +53,76 @@ def test_numpy_err_state_is_default(): def test_make_signature(func, expected): # see gh-17608 assert make_signature(func) == expected + + +def test_raise_with_traceback(): + with pytest.raises(LookupError, match="error_text"): + try: + raise ValueError("THIS IS AN ERROR") + except ValueError: + e = LookupError("error_text") + raise_with_traceback(e) + + with pytest.raises(LookupError, match="error_text"): + try: + raise ValueError("This is another error") + except ValueError: + e = LookupError("error_text") + _, _, traceback = sys.exc_info() + raise_with_traceback(e, traceback) + + +def test_convert_rows_list_to_csv_str(): + rows_list = ["aaa", "bbb", "ccc"] + ret = tm.convert_rows_list_to_csv_str(rows_list) + + if compat.is_platform_windows(): + expected = "aaa\r\nbbb\r\nccc\r\n" + else: + expected = "aaa\nbbb\nccc\n" + + assert ret == expected + + +def test_create_temp_directory(): + with tm.ensure_clean_dir() as path: 
+ assert os.path.exists(path) + assert os.path.isdir(path) + assert not os.path.exists(path) + + +def test_assert_raises_regex_deprecated(): + # see gh-23592 + + with tm.assert_produces_warning(FutureWarning): + msg = "Not equal!" + + with tm.assert_raises_regex(AssertionError, msg): + assert 1 == 2, msg + + +def test_datapath_missing(datapath, request): + if not request.config.getoption("--strict-data-files"): + pytest.skip("Need to set '--strict-data-files'") + + with pytest.raises(ValueError, match="Could not find file"): + datapath("not_a_file") + + args = ("data", "iris.csv") + + result = datapath(*args) + expected = os.path.join(os.path.dirname(os.path.dirname(__file__)), *args) + + assert result == expected + + +def test_rng_context(): + import numpy as np + + expected0 = 1.764052345967664 + expected1 = 1.6243453636632417 + + with tm.RNGContext(0): + with tm.RNGContext(1): + assert np.random.randn() == expected1 + assert np.random.randn() == expected0 From bab279ab76125e2622cc8dc9a8e6521e88a5a1d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 21 Dec 2018 11:09:56 -0600 Subject: [PATCH 02/15] COMPAT: Add keepdims and friends to validation (#24356) --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/compat/numpy/function.py | 23 ++++++++++++++--- pandas/core/generic.py | 12 +++++++-- pandas/tests/series/test_analytics.py | 36 +++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 78f864f0dcb73..7da1c1aeef348 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1400,6 +1400,7 @@ Numeric - Added ``log10`` to the list of supported functions in :meth:`DataFrame.eval` (:issue:`24139`) - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) - Checking PEP 3141 numbers in :func:`~pandas.api.types.is_scalar` function returns ``True`` (:issue:`22903`) 
+- Reduction methods like :meth:`Series.sum` now accept the default value of ``keepdims=False`` when called from a NumPy ufunc, rather than raising a ``TypeError``. Full support for ``keepdims`` has not been implemented (:issue:`24356`). Conversion ^^^^^^^^^^ diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 30fdeca35faf3..417ddd0d8af17 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -189,15 +189,16 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): ALLANY_DEFAULTS = OrderedDict() ALLANY_DEFAULTS['dtype'] = None ALLANY_DEFAULTS['out'] = None +ALLANY_DEFAULTS['keepdims'] = False validate_all = CompatValidator(ALLANY_DEFAULTS, fname='all', method='both', max_fname_arg_count=1) validate_any = CompatValidator(ALLANY_DEFAULTS, fname='any', method='both', max_fname_arg_count=1) -LOGICAL_FUNC_DEFAULTS = dict(out=None) +LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs') -MINMAX_DEFAULTS = dict(out=None) +MINMAX_DEFAULTS = dict(out=None, keepdims=False) validate_min = CompatValidator(MINMAX_DEFAULTS, fname='min', method='both', max_fname_arg_count=1) validate_max = CompatValidator(MINMAX_DEFAULTS, fname='max', @@ -225,16 +226,32 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): STAT_FUNC_DEFAULTS = OrderedDict() STAT_FUNC_DEFAULTS['dtype'] = None STAT_FUNC_DEFAULTS['out'] = None + +PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy() +SUM_DEFAULTS['keepdims'] = False +SUM_DEFAULTS['initial'] = None + +MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy() +MEDIAN_DEFAULTS['overwrite_input'] = False +MEDIAN_DEFAULTS['keepdims'] = False + +STAT_FUNC_DEFAULTS['keepdims'] = False + validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method='kwargs') -validate_sum = CompatValidator(STAT_FUNC_DEFAULTS, fname='sort', +validate_sum = CompatValidator(SUM_DEFAULTS, fname='sum', method='both', 
max_fname_arg_count=1) +validate_prod = CompatValidator(PROD_DEFAULTS, fname="prod", + method="both", max_fname_arg_count=1) validate_mean = CompatValidator(STAT_FUNC_DEFAULTS, fname='mean', method='both', max_fname_arg_count=1) +validate_median = CompatValidator(MEDIAN_DEFAULTS, fname='median', + method='both', max_fname_arg_count=1) STAT_DDOF_FUNC_DEFAULTS = OrderedDict() STAT_DDOF_FUNC_DEFAULTS['dtype'] = None STAT_DDOF_FUNC_DEFAULTS['out'] = None +STAT_DDOF_FUNC_DEFAULTS['keepdims'] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method='kwargs') diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6eb6bc124c80a..c1a53e1e97803 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10834,7 +10834,12 @@ def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc, def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs): - nv.validate_stat_func(tuple(), kwargs, fname=name) + if name == 'sum': + nv.validate_sum(tuple(), kwargs) + elif name == 'prod': + nv.validate_prod(tuple(), kwargs) + else: + nv.validate_stat_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: @@ -10855,7 +10860,10 @@ def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f, @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - nv.validate_stat_func(tuple(), kwargs, fname=name) + if name == 'median': + nv.validate_median(tuple(), kwargs) + else: + nv.validate_stat_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 81d60aba44b0f..0d8804dba83c1 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1641,6 +1641,42 @@ def test_value_counts_categorical_not_ordered(self): 
tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) + @pytest.mark.parametrize("func", [np.any, np.all]) + @pytest.mark.parametrize("kwargs", [ + dict(keepdims=True), + dict(out=object()), + ]) + @td.skip_if_np_lt_115 + def test_validate_any_all_out_keepdims_raises(self, kwargs, func): + s = pd.Series([1, 2]) + param = list(kwargs)[0] + name = func.__name__ + + msg = "the '{}' parameter .* {}".format(param, name) + with pytest.raises(ValueError, match=msg): + func(s, **kwargs) + + @td.skip_if_np_lt_115 + def test_validate_sum_initial(self): + s = pd.Series([1, 2]) + with pytest.raises(ValueError, match="the 'initial' .* sum"): + np.sum(s, initial=10) + + def test_validate_median_initial(self): + s = pd.Series([1, 2]) + with pytest.raises(ValueError, + match="the 'overwrite_input' .* median"): + # It seems like np.median doesn't dispatch, so we use the + # method instead of the ufunc. + s.median(overwrite_input=True) + + @td.skip_if_np_lt_115 + def test_validate_stat_keepdims(self): + s = pd.Series([1, 2]) + with pytest.raises(ValueError, + match="the 'keepdims'"): + np.sum(s, keepdims=True) + main_dtypes = [ 'datetime', From 0bb3772219c5532f4c9703e4e4f2a2e844ee55d3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 21 Dec 2018 11:11:49 -0600 Subject: [PATCH 03/15] BUG/PERF: Sparse get_dummies uses concat (#24372) --- asv_bench/benchmarks/join_merge.py | 2 +- asv_bench/benchmarks/panel_ctor.py | 12 ++++++------ asv_bench/benchmarks/reindex.py | 4 ++-- asv_bench/benchmarks/timedelta.py | 9 +++++---- asv_bench/benchmarks/timestamp.py | 7 ++++--- doc/source/whatsnew/v0.24.0.rst | 4 +++- pandas/core/dtypes/concat.py | 8 ++++---- pandas/core/reshape/reshape.py | 23 ++++++++++++++++------- pandas/tests/sparse/test_reshape.py | 6 +++++- 9 files changed, 46 insertions(+), 29 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 
88a59fea375ea..a1cdb00260fc4 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -50,7 +50,7 @@ def setup(self, axis): self.empty_right = [df, DataFrame()] def time_concat_series(self, axis): - concat(self.series, axis=axis) + concat(self.series, axis=axis, sort=False) def time_concat_small_frames(self, axis): concat(self.small_frames, axis=axis) diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index 47b3ad612f9b1..627705284481b 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,7 +1,7 @@ import warnings from datetime import datetime, timedelta -from pandas import DataFrame, Panel, DatetimeIndex, date_range +from pandas import DataFrame, Panel, date_range class DifferentIndexes(object): @@ -23,9 +23,9 @@ def time_from_dict(self): class SameIndexes(object): def setup(self): - idx = DatetimeIndex(start=datetime(1990, 1, 1), - end=datetime(2012, 1, 1), - freq='D') + idx = date_range(start=datetime(1990, 1, 1), + end=datetime(2012, 1, 1), + freq='D') df = DataFrame({'a': 0, 'b': 1, 'c': 2}, index=idx) self.data_frames = dict(enumerate([df] * 100)) @@ -40,10 +40,10 @@ def setup(self): start = datetime(1990, 1, 1) end = datetime(2012, 1, 1) df1 = DataFrame({'a': 0, 'b': 1, 'c': 2}, - index=DatetimeIndex(start=start, end=end, freq='D')) + index=date_range(start=start, end=end, freq='D')) end += timedelta(days=1) df2 = DataFrame({'a': 0, 'b': 1, 'c': 2}, - index=DatetimeIndex(start=start, end=end, freq='D')) + index=date_range(start=start, end=end, freq='D')) dfs = [df1] * 50 + [df2] * 50 self.data_frames = dict(enumerate(dfs)) diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 576dc495eb984..fb47fa81d8dfd 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,6 +1,6 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, 
Index, +from pandas import (DataFrame, Series, MultiIndex, Index, date_range) from .pandas_vb_common import lib @@ -8,7 +8,7 @@ class Reindex(object): def setup(self): - rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') + rng = date_range(start='1/1/1970', periods=10000, freq='1min') self.df = DataFrame(np.random.rand(10000, 10), index=rng, columns=range(10)) self.df['foo'] = 'bar' diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 7ee73fb7ac7b6..0cfbbd536bc8b 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -1,8 +1,9 @@ import datetime import numpy as np -from pandas import Series, timedelta_range, to_timedelta, Timestamp, \ - Timedelta, TimedeltaIndex, DataFrame + +from pandas import ( + DataFrame, Series, Timedelta, Timestamp, timedelta_range, to_timedelta) class TimedeltaConstructor(object): @@ -122,8 +123,8 @@ def time_timedelta_nanoseconds(self, series): class TimedeltaIndexing(object): def setup(self): - self.index = TimedeltaIndex(start='1985', periods=1000, freq='D') - self.index2 = TimedeltaIndex(start='1986', periods=1000, freq='D') + self.index = timedelta_range(start='1985', periods=1000, freq='D') + self.index2 = timedelta_range(start='1986', periods=1000, freq='D') self.series = Series(range(1000), index=self.index) self.timedelta = self.index[500] diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index 64f46fe378e53..4c1d6e8533408 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -1,8 +1,9 @@ import datetime -from pandas import Timestamp -import pytz import dateutil +import pytz + +from pandas import Timestamp class TimestampConstruction(object): @@ -46,7 +47,7 @@ def time_dayofweek(self, tz, freq): self.ts.dayofweek def time_weekday_name(self, tz, freq): - self.ts.weekday_name + self.ts.day_name def time_dayofyear(self, tz, freq): self.ts.dayofyear diff --git 
a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 7da1c1aeef348..fd41330168dd1 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -647,7 +647,7 @@ changes were made: * The ``out`` and ``mode`` parameters are now longer accepted (previously, this raised if they were specified). * Passing a scalar for ``indices`` is no longer allowed. -- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. +- The result of :func:`concat` with a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. - ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. - Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. - ``DataFrame[column]`` is now a :class:`Series` with sparse values, rather than a :class:`SparseSeries`, when slicing a single column with sparse values (:issue:`23559`). @@ -1104,6 +1104,7 @@ Other API Changes - The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`) - :meth:`CategoricalIndex.reindex` now raises a ``ValueError`` if the target index is non-unique and not equal to the current index. It previously only raised if the target index was not of a categorical dtype (:issue:`23963`). - :func:`Series.to_list` and :func:`Index.to_list` are now aliases of ``Series.tolist`` respectively ``Index.tolist`` (:issue:`8826`) +- The result of ``SparseSeries.unstack`` is now a :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (:issue:`24372`). .. 
_whatsnew_0240.deprecations: @@ -1616,6 +1617,7 @@ Sparse - Bug in :meth:`SparseArary.unique` not returning the unique values (:issue:`19595`) - Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`) - Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) +- Bug in :func:`concat` when concatenating a list of :class:`Series` with all-sparse values changing the ``fill_value`` and converting to a dense Series (:issue:`24371`) Style ^^^^^ diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 0df0c01dbd47a..a90cfa4e4c906 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -66,19 +66,19 @@ def _get_series_result_type(result, objs=None): return appropriate class of Series concat input is either dict or array-like """ + from pandas import SparseSeries, SparseDataFrame, DataFrame + # concat Series with axis 1 if isinstance(result, dict): # concat Series with axis 1 - if all(is_sparse(c) for c in compat.itervalues(result)): - from pandas.core.sparse.api import SparseDataFrame + if all(isinstance(c, (SparseSeries, SparseDataFrame)) + for c in compat.itervalues(result)): return SparseDataFrame else: - from pandas.core.frame import DataFrame return DataFrame # otherwise it is a SingleBlockManager (axis = 0) if result._block.is_sparse: - from pandas.core.sparse.api import SparseSeries return SparseSeries else: return objs[0]._constructor diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8319a8cc5417c..713a4b19c1fd5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -11,8 +11,8 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like, - is_object_dtype, needs_i8_conversion) + ensure_platform_int, is_bool_dtype, is_extension_array_dtype, + is_integer_dtype, 
is_list_like, is_object_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import notna from pandas import compat @@ -853,6 +853,7 @@ def check_len(item, name): def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False, dtype=None): + from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) @@ -909,7 +910,15 @@ def _make_col_name(prefix, prefix_sep, level): index = None if sparse: - sparse_series = {} + + if is_integer_dtype(dtype): + fill_value = 0 + elif dtype == bool: + fill_value = False + else: + fill_value = 0.0 + + sparse_series = [] N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] mask = codes != -1 @@ -926,12 +935,12 @@ def _make_col_name(prefix, prefix_sep, level): dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), - sparse_index=IntIndex(N, ixs), fill_value=0, + sparse_index=IntIndex(N, ixs), + fill_value=fill_value, dtype=dtype) - sparse_series[col] = Series(data=sarr, index=index) + sparse_series.append(Series(data=sarr, index=index, name=col)) - out = DataFrame(sparse_series, index=index, columns=dummy_cols, - dtype=dtype) + out = concat(sparse_series, axis=1, copy=False) return out else: diff --git a/pandas/tests/sparse/test_reshape.py b/pandas/tests/sparse/test_reshape.py index b492c47375bcf..d4ba672607982 100644 --- a/pandas/tests/sparse/test_reshape.py +++ b/pandas/tests/sparse/test_reshape.py @@ -35,4 +35,8 @@ def test_sparse_frame_unstack(sparse_df): def test_sparse_series_unstack(sparse_df, multi_index3): frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack() - tm.assert_sp_frame_equal(frame, sparse_df) + + arr = np.array([1, np.nan, np.nan]) + arrays = {i: pd.SparseArray(np.roll(arr, i)) for i in range(3)} + expected = pd.DataFrame(arrays) + tm.assert_frame_equal(frame, expected) From 
8c58817bd0eaca46d7c0d0c975715ad37fe43710 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 21 Dec 2018 18:12:10 +0000 Subject: [PATCH 04/15] API: Make Series.searchsorted return a scalar, when supplied a scalar (#23801) --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/base.py | 14 ++++++++++---- pandas/core/series.py | 6 ++++-- .../arrays/categorical/test_analytics.py | 3 +++ pandas/tests/indexes/multi/test_monotonic.py | 7 +++++++ pandas/tests/series/test_analytics.py | 19 +++++++++++-------- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index fd41330168dd1..343f17d8ff89c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1098,6 +1098,7 @@ Other API Changes has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) +- :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`). - :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`). - :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather that a ``ValueError``, if a searched for key is not found in its categories (:issue:`23466`). - :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`). 
diff --git a/pandas/core/base.py b/pandas/core/base.py index 46f61c353056e..4a64ea0e56574 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1387,8 +1387,14 @@ def factorize(self, sort=False, na_sentinel=-1): Returns ------- - indices : array of ints - Array of insertion points with the same shape as `value`. + int or array of int + A scalar or array of insertion points with the + same shape as `value`. + + .. versionchanged :: 0.24.0 + If `value` is a scalar, an int is now always returned. + Previously, scalar inputs returned a 1-item array for + :class:`Series` and :class:`Categorical`. See Also -------- @@ -1409,7 +1415,7 @@ def factorize(self, sort=False, na_sentinel=-1): dtype: int64 >>> x.searchsorted(4) - array([3]) + 3 >>> x.searchsorted([0, 4]) array([0, 3]) @@ -1426,7 +1432,7 @@ def factorize(self, sort=False, na_sentinel=-1): Categories (4, object): [apple < bread < cheese < milk] >>> x.searchsorted('bread') - array([1]) # Note: an array, not a scalar + 1 >>> x.searchsorted(['bread'], side='right') array([3]) diff --git a/pandas/core/series.py b/pandas/core/series.py index 773f2d17cf0fc..d642a221e4494 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2215,8 +2215,10 @@ def __rmatmul__(self, other): def searchsorted(self, value, side='left', sorter=None): if sorter is not None: sorter = ensure_platform_int(sorter) - return self._values.searchsorted(Series(value)._values, - side=side, sorter=sorter) + result = self._values.searchsorted(Series(value)._values, + side=side, sorter=sorter) + + return result[0] if is_scalar(value) else result # ------------------------------------------------------------------- # Combination diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 4251273e424dd..b2c9151e1fa94 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -8,6 +8,7 @@ from pandas.compat import
PYPY from pandas import Categorical, Index, Series +from pandas.api.types import is_scalar import pandas.util.testing as tm @@ -86,9 +87,11 @@ def test_searchsorted(self): # Searching for single item argument, side='left' (default) res_cat = c1.searchsorted('apple') assert res_cat == 2 + assert is_scalar(res_cat) res_ser = s1.searchsorted('apple') assert res_ser == 2 + assert is_scalar(res_ser) # Searching for single item array, side='left' (default) res_cat = c1.searchsorted(['bread']) diff --git a/pandas/tests/indexes/multi/test_monotonic.py b/pandas/tests/indexes/multi/test_monotonic.py index 3c7db70b7e242..72e9bcc1e2eb1 100644 --- a/pandas/tests/indexes/multi/test_monotonic.py +++ b/pandas/tests/indexes/multi/test_monotonic.py @@ -5,6 +5,7 @@ import pandas as pd from pandas import Index, IntervalIndex, MultiIndex +from pandas.api.types import is_scalar def test_is_monotonic_increasing(): @@ -182,22 +183,28 @@ def test_searchsorted_monotonic(indices): # test searchsorted only for increasing if indices.is_monotonic_increasing: ssm_left = indices._searchsorted_monotonic(value, side='left') + assert is_scalar(ssm_left) assert expected_left == ssm_left ssm_right = indices._searchsorted_monotonic(value, side='right') + assert is_scalar(ssm_right) assert expected_right == ssm_right ss_left = indices.searchsorted(value, side='left') + assert is_scalar(ss_left) assert expected_left == ss_left ss_right = indices.searchsorted(value, side='right') + assert is_scalar(ss_right) assert expected_right == ss_right elif indices.is_monotonic_decreasing: ssm_left = indices._searchsorted_monotonic(value, side='left') + assert is_scalar(ssm_left) assert expected_left == ssm_left ssm_right = indices._searchsorted_monotonic(value, side='right') + assert is_scalar(ssm_right) assert expected_right == ssm_right else: diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 0d8804dba83c1..73c6ea67ee8aa 100644 --- 
a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -16,6 +16,7 @@ from pandas import ( Categorical, CategoricalIndex, DataFrame, Series, bdate_range, compat, date_range, isna, notna) +from pandas.api.types import is_scalar from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import Timedelta @@ -1364,17 +1365,19 @@ def test_numpy_repeat(self): def test_searchsorted(self): s = Series([1, 2, 3]) - idx = s.searchsorted(1, side='left') - tm.assert_numpy_array_equal(idx, np.array([0], dtype=np.intp)) + result = s.searchsorted(1, side='left') + assert is_scalar(result) + assert result == 0 - idx = s.searchsorted(1, side='right') - tm.assert_numpy_array_equal(idx, np.array([1], dtype=np.intp)) + result = s.searchsorted(1, side='right') + assert is_scalar(result) + assert result == 1 def test_searchsorted_numeric_dtypes_scalar(self): s = Series([1, 2, 90, 1000, 3e9]) r = s.searchsorted(30) - e = 2 - assert r == e + assert is_scalar(r) + assert r == 2 r = s.searchsorted([30]) e = np.array([2], dtype=np.intp) @@ -1390,8 +1393,8 @@ def test_search_sorted_datetime64_scalar(self): s = Series(pd.date_range('20120101', periods=10, freq='2D')) v = pd.Timestamp('20120102') r = s.searchsorted(v) - e = 1 - assert r == e + assert is_scalar(r) + assert r == 1 def test_search_sorted_datetime64_list(self): s = Series(pd.date_range('20120101', periods=10, freq='2D')) From aa3d56effe18031e6377a0714a145287445a4b89 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 21 Dec 2018 17:05:33 -0800 Subject: [PATCH 05/15] TST: Add missing match parameter in test_move (#24393) xref gh-23922 --- pandas/tests/util/test_move.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/util/test_move.py b/pandas/tests/util/test_move.py index c12e2f7a167ad..ef98f2032e6ca 100644 --- a/pandas/tests/util/test_move.py +++ b/pandas/tests/util/test_move.py @@ -23,7 +23,7 @@ def 
test_more_than_one_ref(): b = b"testing" - with pytest.raises(BadMove) as e: + with pytest.raises(BadMove, match="testing") as e: def handle_success(type_, value, tb): assert value.args[0] is b return type(e).handle_success(e, type_, value, tb) # super From 3e0358d869f8a76b3223e8bb313a92edbaa369db Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sat, 22 Dec 2018 02:07:31 +0100 Subject: [PATCH 06/15] CLN: post-post numpy bump (#24365) * CLN: post-post numpy bumpy * Lint --- pandas/_libs/lib.pyx | 8 ++++---- pandas/_libs/tslibs/timedeltas.pyx | 3 --- pandas/tests/arrays/categorical/test_api.py | 1 - pandas/tests/indexes/datetimes/test_misc.py | 1 - .../indexes/timedeltas/test_arithmetic.py | 18 ++++++------------ pandas/tests/util/test_util.py | 1 - 6 files changed, 10 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2736133a79d8e..3bc39979b0fc1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -153,10 +153,10 @@ def is_scalar(val: object) -> bool: """ return (cnp.PyArray_IsAnyScalar(val) - # As of numpy-1.9, PyArray_IsAnyScalar misses bytearrays on Py3. - or isinstance(val, (bytes, Fraction, Number)) - # We differ from numpy (as of 1.10), which claims that None is - # not scalar in np.isscalar(). 
+ # PyArray_IsAnyScalar is always False for bytearrays on Py3 + or isinstance(val, (Fraction, Number)) + # We differ from numpy, which claims that None is not scalar; + # see np.isscalar or val is None or PyDate_Check(val) or PyDelta_Check(val) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index a70e245fa1504..219d3cd3ece75 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -161,9 +161,6 @@ cpdef convert_to_timedelta64(object ts, object unit): - None/NaT Return an ns based int64 - - # kludgy here until we have a timedelta scalar - # handle the numpy < 1.7 case """ if checknull_with_nat(ts): return np.timedelta64(NPY_NAT) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 505c2a8eab7cd..348bb947efef7 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -455,7 +455,6 @@ def test_codes_immutable(self): c.codes = np.array([0, 1, 2, 0, 1], dtype='int8') # changes in the codes array should raise - # np 1.6.1 raises RuntimeError rather than ValueError codes = c.codes with pytest.raises(ValueError): diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 8d9f496b70079..cec181161fc11 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -188,7 +188,6 @@ def test_datetimeindex_accessors(self): assert sum(dti.is_year_end) == 1 # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, - # CBD requires np >= 1.7 bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu') dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) pytest.raises(ValueError, lambda: dti.is_month_start) diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index d2d116055f17c..b75c89631d450 100644 --- 
a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- from datetime import timedelta -from distutils.version import LooseVersion import numpy as np import pytest @@ -197,15 +196,13 @@ def test_ops_ndarray(self): other = pd.to_timedelta(['1 day']).values expected = pd.to_timedelta(['2 days']).values tm.assert_numpy_array_equal(td + other, expected) - if LooseVersion(np.__version__) >= LooseVersion('1.8'): - tm.assert_numpy_array_equal(other + td, expected) + tm.assert_numpy_array_equal(other + td, expected) pytest.raises(TypeError, lambda: td + np.array([1])) pytest.raises(TypeError, lambda: np.array([1]) + td) expected = pd.to_timedelta(['0 days']).values tm.assert_numpy_array_equal(td - other, expected) - if LooseVersion(np.__version__) >= LooseVersion('1.8'): - tm.assert_numpy_array_equal(-other + td, expected) + tm.assert_numpy_array_equal(-other + td, expected) pytest.raises(TypeError, lambda: td - np.array([1])) pytest.raises(TypeError, lambda: np.array([1]) - td) @@ -217,21 +214,18 @@ def test_ops_ndarray(self): tm.assert_numpy_array_equal(td / other, np.array([1], dtype=np.float64)) - if LooseVersion(np.__version__) >= LooseVersion('1.8'): - tm.assert_numpy_array_equal(other / td, - np.array([1], dtype=np.float64)) + tm.assert_numpy_array_equal(other / td, + np.array([1], dtype=np.float64)) # timedelta, datetime other = pd.to_datetime(['2000-01-01']).values expected = pd.to_datetime(['2000-01-02']).values tm.assert_numpy_array_equal(td + other, expected) - if LooseVersion(np.__version__) >= LooseVersion('1.8'): - tm.assert_numpy_array_equal(other + td, expected) + tm.assert_numpy_array_equal(other + td, expected) expected = pd.to_datetime(['1999-12-31']).values tm.assert_numpy_array_equal(-td + other, expected) - if LooseVersion(np.__version__) >= LooseVersion('1.8'): - tm.assert_numpy_array_equal(other - td, expected) + tm.assert_numpy_array_equal(other - td, 
expected) def test_tdi_ops_attributes(self): rng = timedelta_range('2 days', periods=5, freq='2D', name='x') diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index f9282ff15612d..a2dc9b699566a 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -30,7 +30,6 @@ def test_rands_array_2d(): def test_numpy_err_state_is_default(): - # The defaults since numpy 1.6.0 expected = {"over": "warn", "divide": "warn", "invalid": "warn", "under": "ignore"} import numpy as np From ffa4c7775d242458398cd98f90d18efc63ff0dfe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 23 Dec 2018 09:15:41 -0600 Subject: [PATCH 07/15] DEPR: Undeprecate period - integer addition (#24352) --- doc/source/whatsnew/v0.24.0.rst | 24 ++--- pandas/_libs/tslibs/period.pyx | 6 +- pandas/core/arrays/datetimelike.py | 12 ++- pandas/tests/arithmetic/test_period.py | 116 +++++++++------------ pandas/tests/indexes/period/test_period.py | 7 +- pandas/tests/scalar/period/test_asfreq.py | 9 +- pandas/tests/scalar/period/test_period.py | 61 +++++------ 7 files changed, 94 insertions(+), 141 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 343f17d8ff89c..724cfddb1b94c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1153,25 +1153,19 @@ Deprecations .. _whatsnew_0240.deprecations.datetimelike_int_ops: -Integer Addition/Subtraction with Datetime-like Classes Is Deprecated -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In the past, users could add or subtract integers or integer-dtypes arrays -from :class:`Period`, :class:`PeriodIndex`, and in some cases -:class:`Timestamp`, :class:`DatetimeIndex` and :class:`TimedeltaIndex`. 
+Integer Addition/Subtraction with Datetimes and Timedeltas is Deprecated +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the past, users could—in some cases—add or subtract integers or integer-dtype +arrays from :class:`Timestamp`, :class:`DatetimeIndex` and :class:`TimedeltaIndex`. This usage is now deprecated. Instead add or subtract integer multiples of -the object's ``freq`` attribute. The result of subtraction of :class:`Period` -objects will be agnostic of the multiplier of the objects' ``freq`` attribute -(:issue:`21939`, :issue:`23878`). +the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`). *Previous Behavior*: .. code-block:: ipython - In [3]: per = pd.Period('2016Q1') - In [4]: per + 3 - Out[4]: Period('2016Q4', 'Q-DEC') - In [5]: ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour()) In [6]: ts + 2 Out[6]: Timestamp('1994-05-06 14:15:16', freq='H') @@ -1189,12 +1183,6 @@ objects will be agnostic of the multiplier of the objects' ``freq`` attribute .. 
ipython:: python :okwarning: - per = pd.Period('2016Q1') - per + 3 - - per = pd.Period('2016Q1') - per + 3 * per.freq - ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour()) ts + 2 * ts.freq diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 7f5305065382c..6a257d40dd44b 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -32,7 +32,7 @@ cdef extern from "src/datetime/np_datetime.h": cimport util from util cimport is_period_object, is_string_object -from timestamps import Timestamp, maybe_integer_op_deprecated +from timestamps import Timestamp from timezones cimport is_utc, is_tzlocal, get_dst_info from timedeltas import Timedelta from timedeltas cimport delta_to_nanoseconds @@ -1655,8 +1655,6 @@ cdef class _Period(object): elif other is NaT: return NaT elif util.is_integer_object(other): - maybe_integer_op_deprecated(self) - ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) elif (PyDateTime_Check(other) or @@ -1683,8 +1681,6 @@ cdef class _Period(object): neg_other = -other return self + neg_other elif util.is_integer_object(other): - maybe_integer_op_deprecated(self) - ordinal = self.ordinal - other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) elif is_period_object(other): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c01b04991e52b..b8364760caa37 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -905,7 +905,8 @@ def __add__(self, other): elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these - maybe_integer_op_deprecated(self) + if not is_period_dtype(self): + maybe_integer_op_deprecated(self) result = self._time_shift(other) # array-like others @@ -919,7 +920,8 @@ def __add__(self, other): # DatetimeIndex, ndarray[datetime64] return self._add_datetime_arraylike(other) elif 
is_integer_dtype(other): - maybe_integer_op_deprecated(self) + if not is_period_dtype(self): + maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.add) elif is_float_dtype(other): # Explicitly catch invalid dtypes @@ -966,7 +968,8 @@ def __sub__(self, other): elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these - maybe_integer_op_deprecated(self) + if not is_period_dtype(self): + maybe_integer_op_deprecated(self) result = self._time_shift(-other) elif isinstance(other, Period): @@ -986,7 +989,8 @@ def __sub__(self, other): # PeriodIndex result = self._sub_period_array(other) elif is_integer_dtype(other): - maybe_integer_op_deprecated(self) + if not is_period_dtype(self): + maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.sub) elif isinstance(other, ABCIndexClass): raise TypeError("cannot subtract {cls} and {typ}" diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 6288e4ec26e1e..469353042a878 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -659,14 +659,10 @@ def test_pi_sub_offset_array(self, box): def test_pi_add_iadd_int(self, one): # Variants of `one` for #19012 rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - result = rng + one + result = rng + one expected = pd.period_range('2000-01-01 10:00', freq='H', periods=10) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - rng += one + rng += one tm.assert_index_equal(rng, expected) def test_pi_sub_isub_int(self, one): @@ -675,24 +671,18 @@ def test_pi_sub_isub_int(self, one): the integer 1, e.g. int, long, np.int64, np.uint8, ... 
""" rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - result = rng - one + result = rng - one expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - rng -= one + rng -= one tm.assert_index_equal(rng, expected) @pytest.mark.parametrize('five', [5, np.array(5, dtype=np.int64)]) def test_pi_sub_intlike(self, five): rng = period_range('2007-01', periods=50) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - result = rng - five - exp = rng + (-five) + result = rng - five + exp = rng + (-five) tm.assert_index_equal(result, exp) def test_pi_sub_isub_offset(self): @@ -757,9 +747,8 @@ def test_pi_add_intarray(self, int_holder, op): # GH#19959 pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('NaT')]) other = int_holder([4, -1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - result = op(pi, other) + + result = op(pi, other) expected = pd.PeriodIndex([pd.Period('2016Q1'), pd.Period('NaT')]) tm.assert_index_equal(result, expected) @@ -768,16 +757,13 @@ def test_pi_sub_intarray(self, int_holder): # GH#19959 pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('NaT')]) other = int_holder([4, -1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - result = pi - other + + result = pi - other expected = pd.PeriodIndex([pd.Period('2014Q1'), pd.Period('NaT')]) tm.assert_index_equal(result, expected) with pytest.raises(TypeError): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - other - pi + other - pi # --------------------------------------------------------------- # 
Timedelta-like (timedelta, timedelta64, Timedelta, Tick) @@ -1039,12 +1025,11 @@ def test_pi_ops(self): expected = PeriodIndex(['2011-03', '2011-04', '2011-05', '2011-06'], freq='M', name='idx') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - self._check(idx, lambda x: x + 2, expected) - self._check(idx, lambda x: 2 + x, expected) - self._check(idx + 2, lambda x: x - 2, idx) + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + + self._check(idx + 2, lambda x: x - 2, idx) result = idx - Period('2011-01', freq='M') off = idx.freq @@ -1089,53 +1074,50 @@ def test_pi_ops_nat(self): freq='M', name='idx') expected = PeriodIndex(['2011-03', '2011-04', 'NaT', '2011-06'], freq='M', name='idx') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - self._check(idx, lambda x: x + 2, expected) - self._check(idx, lambda x: 2 + x, expected) - self._check(idx, lambda x: np.add(x, 2), expected) - self._check(idx + 2, lambda x: x - 2, idx) - self._check(idx + 2, lambda x: np.subtract(x, 2), idx) + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + self._check(idx, lambda x: np.add(x, 2), expected) + + self._check(idx + 2, lambda x: x - 2, idx) + self._check(idx + 2, lambda x: np.subtract(x, 2), idx) # freq with mult idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='2M', name='idx') expected = PeriodIndex(['2011-07', '2011-08', 'NaT', '2011-10'], freq='2M', name='idx') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - self._check(idx, lambda x: x + 3, expected) - self._check(idx, lambda x: 3 + x, expected) - self._check(idx, lambda x: np.add(x, 3), expected) - self._check(idx + 3, lambda x: x - 3, idx) - self._check(idx + 3, lambda x: np.subtract(x, 3), idx) + self._check(idx, lambda x: x + 3, 
expected) + self._check(idx, lambda x: 3 + x, expected) + self._check(idx, lambda x: np.add(x, 3), expected) + + self._check(idx + 3, lambda x: x - 3, idx) + self._check(idx + 3, lambda x: np.subtract(x, 3), idx) def test_pi_ops_array_int(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - clear=[pd.core.arrays.datetimelike]): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') - f = lambda x: x + np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], - freq='M', name='idx') - self._check(idx, f, exp) - - f = lambda x: np.add(x, np.array([4, -1, 1, 2])) - exp = PeriodIndex(['2011-05', '2011-01', 'NaT', '2011-06'], - freq='M', name='idx') - self._check(idx, f, exp) - - f = lambda x: x - np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2010-12', '2010-12', 'NaT', '2010-12'], - freq='M', name='idx') - self._check(idx, f, exp) - - f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) - exp = PeriodIndex(['2010-10', '2010-12', 'NaT', '2011-06'], - freq='M', name='idx') - self._check(idx, f, exp) + + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], + freq='M', name='idx') + f = lambda x: x + np.array([1, 2, 3, 4]) + exp = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], + freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: np.add(x, np.array([4, -1, 1, 2])) + exp = PeriodIndex(['2011-05', '2011-01', 'NaT', '2011-06'], + freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: x - np.array([1, 2, 3, 4]) + exp = PeriodIndex(['2010-12', '2010-12', 'NaT', '2010-12'], + freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) + exp = PeriodIndex(['2010-10', '2010-12', 'NaT', '2011-06'], + freq='M', name='idx') + self._check(idx, f, exp) def test_pi_ops_offset(self): idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', diff --git a/pandas/tests/indexes/period/test_period.py 
b/pandas/tests/indexes/period/test_period.py index 5d78333016f74..a5169aba2db33 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -339,10 +339,9 @@ def test_is_(self): assert not index.is_(index[:]) assert not index.is_(index.asfreq('M')) assert not index.is_(index.asfreq('A')) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - assert not index.is_(index - 2) - assert not index.is_(index - 0) + + assert not index.is_(index - 2) + assert not index.is_(index - 0) def test_contains(self): rng = period_range('2007-01', freq='M', periods=10) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 24f2ed88936b7..f46f2da6c076d 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -5,7 +5,6 @@ from pandas.errors import OutOfBoundsDatetime from pandas import Period, offsets -from pandas.util import testing as tm class TestFreqConversion(object): @@ -16,17 +15,15 @@ def test_asfreq_near_zero(self, freq): per = Period('0001-01-01', freq=freq) tup1 = (per.year, per.hour, per.day) - with tm.assert_produces_warning(FutureWarning): - prev = per - 1 + prev = per - 1 assert prev.ordinal == per.ordinal - 1 tup2 = (prev.year, prev.month, prev.day) assert tup2 < tup1 def test_asfreq_near_zero_weekly(self): # GH#19834 - with tm.assert_produces_warning(FutureWarning): - per1 = Period('0001-01-01', 'D') + 6 - per2 = Period('0001-01-01', 'D') - 6 + per1 = Period('0001-01-01', 'D') + 6 + per2 = Period('0001-01-01', 'D') - 6 week1 = per1.asfreq('W') week2 = per2.asfreq('W') assert week1 != week2 diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 715a596999e42..d0f87618ad3af 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -303,11 +303,10 @@ def test_multiples(self): assert result1.freq == 
offsets.YearEnd(2) assert result2.freq == offsets.YearEnd() - with tm.assert_produces_warning(FutureWarning): - assert (result1 + 1).ordinal == result1.ordinal + 2 - assert (1 + result1).ordinal == result1.ordinal + 2 - assert (result1 - 1).ordinal == result2.ordinal - 2 - assert (-1 + result1).ordinal == result2.ordinal - 2 + assert (result1 + 1).ordinal == result1.ordinal + 2 + assert (1 + result1).ordinal == result1.ordinal + 2 + assert (result1 - 1).ordinal == result2.ordinal - 2 + assert (-1 + result1).ordinal == result2.ordinal - 2 @pytest.mark.parametrize('month', MONTHS) def test_period_cons_quarterly(self, month): @@ -331,8 +330,7 @@ def test_period_cons_annual(self, month): stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) p = Period(stamp, freq=freq) - with tm.assert_produces_warning(FutureWarning): - assert p == exp + 1 + assert p == exp + 1 assert isinstance(p, Period) @pytest.mark.parametrize('day', DAYS) @@ -385,16 +383,14 @@ def test_period_cons_mult(self): assert p2.freq == offsets.MonthEnd() assert p2.freqstr == 'M' - with tm.assert_produces_warning(FutureWarning): - result = p1 + 1 - assert result.ordinal == (p2 + 3).ordinal + result = p1 + 1 + assert result.ordinal == (p2 + 3).ordinal assert result.freq == p1.freq assert result.freqstr == '3M' - with tm.assert_produces_warning(FutureWarning): - result = p1 - 1 - assert result.ordinal == (p2 - 3).ordinal + result = p1 - 1 + assert result.ordinal == (p2 - 3).ordinal assert result.freq == p1.freq assert result.freqstr == '3M' @@ -428,27 +424,23 @@ def test_period_cons_combined(self): assert p3.freq == offsets.Hour() assert p3.freqstr == 'H' - with tm.assert_produces_warning(FutureWarning): - result = p1 + 1 - assert result.ordinal == (p3 + 25).ordinal + result = p1 + 1 + assert result.ordinal == (p3 + 25).ordinal assert result.freq == p1.freq assert result.freqstr == '25H' - with tm.assert_produces_warning(FutureWarning): - result = p2 + 1 - assert result.ordinal == (p3 + 25).ordinal + 
result = p2 + 1 + assert result.ordinal == (p3 + 25).ordinal assert result.freq == p2.freq assert result.freqstr == '25H' - with tm.assert_produces_warning(FutureWarning): - result = p1 - 1 - assert result.ordinal == (p3 - 25).ordinal + result = p1 - 1 + assert result.ordinal == (p3 - 25).ordinal assert result.freq == p1.freq assert result.freqstr == '25H' - with tm.assert_produces_warning(FutureWarning): - result = p2 - 1 - assert result.ordinal == (p3 - 25).ordinal + result = p2 - 1 + assert result.ordinal == (p3 - 25).ordinal assert result.freq == p2.freq assert result.freqstr == '25H' @@ -803,16 +795,14 @@ def test_properties_quarterly(self): # for x in range(3): for qd in (qedec_date, qejan_date, qejun_date): - with tm.assert_produces_warning(FutureWarning): - assert (qd + x).qyear == 2007 - assert (qd + x).quarter == x + 1 + assert (qd + x).qyear == 2007 + assert (qd + x).quarter == x + 1 def test_properties_monthly(self): # Test properties on Periods with daily frequency. m_date = Period(freq='M', year=2007, month=1) for x in range(11): - with tm.assert_produces_warning(FutureWarning): - m_ival_x = m_date + x + m_ival_x = m_date + x assert m_ival_x.year == 2007 if 1 <= x + 1 <= 3: assert m_ival_x.quarter == 1 @@ -832,8 +822,7 @@ def test_properties_weekly(self): assert w_date.quarter == 1 assert w_date.month == 1 assert w_date.week == 1 - with tm.assert_produces_warning(FutureWarning): - assert (w_date - 1).week == 52 + assert (w_date - 1).week == 52 assert w_date.days_in_month == 31 assert Period(freq='W', year=2012, month=2, day=1).days_in_month == 29 @@ -845,8 +834,7 @@ def test_properties_weekly_legacy(self): assert w_date.quarter == 1 assert w_date.month == 1 assert w_date.week == 1 - with tm.assert_produces_warning(FutureWarning): - assert (w_date - 1).week == 52 + assert (w_date - 1).week == 52 assert w_date.days_in_month == 31 exp = Period(freq='W', year=2012, month=2, day=1) @@ -1039,9 +1027,8 @@ def test_sub_delta(self): def test_add_integer(self): 
per1 = Period(freq='D', year=2008, month=1, day=1) per2 = Period(freq='D', year=2008, month=1, day=2) - with tm.assert_produces_warning(FutureWarning): - assert per1 + 1 == per2 - assert 1 + per1 == per2 + assert per1 + 1 == per2 + assert 1 + per1 == per2 def test_add_sub_nat(self): # GH#13071 From 2dce6b1e86741aa8d5b2f1b97ca18752d196310b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 23 Dec 2018 07:27:56 -0800 Subject: [PATCH 08/15] CLN: tslibs imports and unused variables (#24401) --- pandas/_libs/tslib.pyx | 1 - pandas/_libs/tslibs/ccalendar.pyx | 4 ---- pandas/_libs/tslibs/conversion.pyx | 14 ++------------ pandas/_libs/tslibs/fields.pyx | 18 ++++++------------ pandas/_libs/tslibs/offsets.pyx | 1 - pandas/_libs/tslibs/parsing.pyx | 2 -- pandas/_libs/tslibs/period.pyx | 2 +- pandas/_libs/tslibs/resolution.pyx | 2 -- pandas/_libs/tslibs/strptime.pyx | 4 +--- pandas/_libs/tslibs/timedeltas.pyx | 1 - pandas/_libs/tslibs/timezones.pyx | 4 ---- 11 files changed, 10 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index efabc5ad0b1ba..3f2b49d141e18 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- import cython -from cython import Py_ssize_t from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyDateTime_CheckExact, diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 587213049af85..91e7d83012bad 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -5,7 +5,6 @@ Cython implementations of functions resembling the stdlib calendar module """ import cython -from cython import Py_ssize_t from numpy cimport int64_t, int32_t @@ -151,12 +150,9 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: Assumes the inputs describe a valid date. 
""" cdef: - bint isleap int32_t doy, dow int woy - isleap = is_leapyear(year) - doy = get_day_of_year(year, month, day) dow = dayofweek(year, month, day) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index e6e7884f05b20..81f2aeae68136 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- import cython -from cython import Py_ssize_t import numpy as np cimport numpy as cnp @@ -1133,7 +1132,7 @@ def normalize_date(dt: object) -> datetime: @cython.wraparound(False) @cython.boundscheck(False) -def normalize_i8_timestamps(int64_t[:] stamps, object tz=None): +def normalize_i8_timestamps(int64_t[:] stamps, object tz): """ Normalize each of the (nanosecond) timezone aware timestamps in the given array by rounding down to the beginning of the day (i.e. midnight). @@ -1152,7 +1151,6 @@ def normalize_i8_timestamps(int64_t[:] stamps, object tz=None): Py_ssize_t n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) - tz = maybe_get_tz(tz) result = _normalize_local(stamps, tz) return result.base # .base to access underlying np.ndarray @@ -1185,15 +1183,7 @@ cdef int64_t[:] _normalize_local(int64_t[:] stamps, tzinfo tz): npy_datetimestruct dts int64_t delta, local_val - if is_utc(tz): - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i], &dts) - result[i] = _normalized_stamp(&dts) - elif is_tzlocal(tz): + if is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 72157c2fcb2f3..df2a189097b78 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -37,7 +37,7 @@ def get_time_micros(ndarray[int64_t] dtindex): ndarray[int64_t] micros micros = np.mod(dtindex, DAY_SECONDS * 1000000000, dtype=np.int64) - micros //= 1000LL + micros //= 1000 return micros @@ -48,12 
+48,10 @@ def build_field_sarray(int64_t[:] dtindex): Datetime as int64 representation to a structured array of fields """ cdef: - Py_ssize_t i, count = 0 + Py_ssize_t i, count = len(dtindex) npy_datetimestruct dts ndarray[int32_t] years, months, days, hours, minutes, seconds, mus - count = len(dtindex) - sa_dtype = [('Y', 'i4'), # year ('M', 'i4'), # month ('D', 'i4'), # day @@ -93,12 +91,11 @@ def get_date_name_field(int64_t[:] dtindex, object field, object locale=None): name based on requested field (e.g. weekday_name) """ cdef: - Py_ssize_t i, count = 0 + Py_ssize_t i, count = len(dtindex) ndarray[object] out, names npy_datetimestruct dts int dow - count = len(dtindex) out = np.empty(count, dtype=object) if field == 'day_name' or field == 'weekday_name': @@ -147,7 +144,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, """ cdef: Py_ssize_t i - int count = 0 + int count = len(dtindex) bint is_business = 0 int end_month = 12 int start_month = 1 @@ -162,7 +159,6 @@ def get_start_end_field(int64_t[:] dtindex, object field, [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]], dtype=np.int32) - count = len(dtindex) out = np.zeros(count, dtype='int8') if freqstr: @@ -388,11 +384,10 @@ def get_date_field(ndarray[int64_t] dtindex, object field): field and return an array of these values. """ cdef: - Py_ssize_t i, count = 0 + Py_ssize_t i, count = len(dtindex) ndarray[int32_t] out npy_datetimestruct dts - count = len(dtindex) out = np.empty(count, dtype='i4') if field == 'Y': @@ -551,11 +546,10 @@ def get_timedelta_field(int64_t[:] tdindex, object field): field and return an array of these values. 
""" cdef: - Py_ssize_t i, count = 0 + Py_ssize_t i, count = len(tdindex) ndarray[int32_t] out pandas_timedeltastruct tds - count = len(tdindex) out = np.empty(count, dtype='i4') if field == 'days': diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 7685ddd76d4b6..0720f5094fa51 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- import cython -from cython import Py_ssize_t import time from cpython.datetime cimport (PyDateTime_IMPORT, diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 9a01bf378e549..6ec49e4fa92ff 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -6,8 +6,6 @@ import sys import re import time -from cython import Py_ssize_t - from cpython.datetime cimport datetime diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 6a257d40dd44b..7c7875f2a6a65 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from datetime import datetime, date +from datetime import datetime from cpython cimport ( PyObject_RichCompareBool, diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 4acffdea78f55..3760c88a38c39 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from cython import Py_ssize_t - import numpy as np from numpy cimport ndarray, int64_t, int32_t diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 46a1145009857..732caf1c0da7f 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -20,8 +20,6 @@ except: except: from _dummy_thread import allocate_lock as _thread_allocate_lock -from cython import Py_ssize_t - import pytz @@ -69,7 +67,7 @@ def array_strptime(object[:] values, object fmt, values : ndarray of string-like 
objects fmt : string-like regex exact : matches must be exact if True, search if False - coerce : if invalid values found, coerce to NaT + errors : string specifying error handling, {'raise', 'ignore', 'coerce'} """ cdef: diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 219d3cd3ece75..a03f460c9e2d2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -7,7 +7,6 @@ import sys cdef bint PY3 = (sys.version_info[0] >= 3) import cython -from cython import Py_ssize_t from cpython cimport Py_NE, Py_EQ, PyObject_RichCompare diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 9f8922b274abd..b67aab1452ebd 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,9 +1,5 @@ # -*- coding: utf-8 -*- -from cython import Py_ssize_t - -from cpython.datetime cimport tzinfo - # dateutil compat from dateutil.tz import ( tzutc as _dateutil_tzutc, From 759a010ed8aee530203e82ad6a41962882267791 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 23 Dec 2018 07:36:40 -0800 Subject: [PATCH 09/15] Rendering Methods portions of #24024 (#24392) --- pandas/core/arrays/datetimelike.py | 88 ++++++++++++++++++------------ pandas/core/arrays/datetimes.py | 12 ++++ pandas/core/arrays/period.py | 10 ++-- pandas/core/arrays/timedeltas.py | 10 ++++ 4 files changed, 80 insertions(+), 40 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b8364760caa37..0463a30cd1135 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -14,7 +14,7 @@ import pandas.compat as compat from pandas.errors import ( AbstractMethodError, NullFrequencyError, PerformanceWarning) -from pandas.util._decorators import Appender, deprecate_kwarg +from pandas.util._decorators import Appender, Substitution, deprecate_kwarg from pandas.core.dtypes.common import ( is_bool_dtype, 
is_datetime64_any_dtype, is_datetime64_dtype, @@ -86,44 +86,45 @@ class DatelikeOps(object): Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. """ + @Substitution(URL="https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior") def strftime(self, date_format): - from pandas import Index - return Index(self.format(date_format=date_format), - dtype=compat.text_type) - strftime.__doc__ = """ - Convert to Index using specified date_format. + """ + Convert to Index using specified date_format. - Return an Index of formatted strings specified by date_format, which - supports the same string format as the python standard library. Details - of the string format can be found in `python string format doc <{0}>`__ + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format + doc <%(URL)s>`__ - Parameters - ---------- - date_format : str - Date format string (e.g. "%Y-%m-%d"). + Parameters + ---------- + date_format : str + Date format string (e.g. "%%Y-%%m-%%d"). - Returns - ------- - Index - Index of formatted strings - - See Also - -------- - to_datetime : Convert the given argument to datetime. - DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. - DatetimeIndex.round : Round the DatetimeIndex to the specified freq. - DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. - - Examples - -------- - >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), - ... 
periods=3, freq='s') - >>> rng.strftime('%B %d, %Y, %r') - Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', - 'March 10, 2018, 09:00:02 AM'], - dtype='object') - """.format("https://docs.python.org/3/library/datetime.html" - "#strftime-and-strptime-behavior") + Returns + ------- + Index + Index of formatted strings + + See Also + -------- + to_datetime : Convert the given argument to datetime. + DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq='s') + >>> rng.strftime('%%B %%d, %%Y, %%r') + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """ + from pandas import Index + return Index(self._format_native_types(date_format=date_format)) class TimelikeOps(object): @@ -298,6 +299,23 @@ def asi8(self): # do not cache or you'll create a memory leak return self._data.view('i8') + # ---------------------------------------------------------------- + # Rendering Methods + + def _format_native_types(self, na_rep=u'NaT', date_format=None): + """ + Helper method for astype when converting to strings. + + Returns + ------- + ndarray[str] + """ + raise AbstractMethodError(self) + + def _formatter(self, boxed=False): + # TODO: Remove Datetime & DatetimeTZ formatters. 
+ return "'{}'".format + # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index c197d6d6e634b..59e9fe49f650a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -468,6 +468,18 @@ def _validate_fill_value(self, fill_value): "Got '{got}'.".format(got=fill_value)) return fill_value + # ----------------------------------------------------------------- + # Rendering Methods + + def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): + from pandas.io.formats.format import _get_format_datetime64_from_values + fmt = _get_format_datetime64_from_values(self, date_format) + + return tslib.format_array_from_datetime(self.asi8, + tz=self.tz, + format=fmt, + na_rep=na_rep) + # ----------------------------------------------------------------- # Comparison Methods diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 60febc5f5636d..6fd98bb25380a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -92,7 +92,9 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -class PeriodArray(dtl.DatetimeLikeArrayMixin, ExtensionArray): +class PeriodArray(dtl.DatetimeLikeArrayMixin, + dtl.DatelikeOps, + ExtensionArray): """ Pandas ExtensionArray for storing Period data. @@ -565,7 +567,7 @@ def asfreq(self, freq=None, how='E'): return type(self)(new_data, freq=freq) # ------------------------------------------------------------------ - # Formatting + # Rendering Methods def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): """ @@ -589,9 +591,7 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): values = np.array([formatter(dt) for dt in values]) return values - # Delegation... 
- def strftime(self, date_format): - return self._format_native_types(date_format=date_format) + # ------------------------------------------------------------------ def repeat(self, repeats, *args, **kwargs): """ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a5d074df338ee..314a3948f1032 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -231,6 +231,16 @@ def _validate_fill_value(self, fill_value): "Got '{got}'.".format(got=fill_value)) return fill_value + # ---------------------------------------------------------------- + # Rendering Methods + + def _formatter(self, boxed=False): + from pandas.io.formats.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) + + def _format_native_types(self): + return self.astype(object) + # ---------------------------------------------------------------- # Arithmetic Methods From 2d3e1037a06ddb47852e592389d1262b80778f0a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 23 Dec 2018 07:45:31 -0800 Subject: [PATCH 10/15] REF/TST: collect reduction tests (#24367) --- pandas/tests/reductions/__init__.py | 4 + pandas/tests/reductions/test_reductions.py | 817 ++++++++++++++++++ .../tests/reductions/test_stat_reductions.py | 202 +++++ pandas/tests/series/test_analytics.py | 764 +--------------- pandas/tests/series/test_datetime_values.py | 18 - pandas/tests/series/test_operators.py | 47 - pandas/tests/series/test_timeseries.py | 29 - pandas/tests/test_base.py | 314 +++---- 8 files changed, 1148 insertions(+), 1047 deletions(-) create mode 100644 pandas/tests/reductions/__init__.py create mode 100644 pandas/tests/reductions/test_reductions.py create mode 100644 pandas/tests/reductions/test_stat_reductions.py diff --git a/pandas/tests/reductions/__init__.py b/pandas/tests/reductions/__init__.py new file mode 100644 index 0000000000000..e3851753b6742 --- /dev/null +++ b/pandas/tests/reductions/__init__.py @@ -0,0 +1,4 @@ +""" 
+Tests for reductions where we want to test for matching behavior across +Array, Index, Series, and DataFrame methods. +""" diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py new file mode 100644 index 0000000000000..e7f984919d80b --- /dev/null +++ b/pandas/tests/reductions/test_reductions.py @@ -0,0 +1,817 @@ +# -*- coding: utf-8 -*- +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, DataFrame, Index, PeriodIndex, Series, compat +from pandas.core import nanops +import pandas.util.testing as tm + + +def get_objs(): + indexes = [ + tm.makeBoolIndex(10, name='a'), + tm.makeIntIndex(10, name='a'), + tm.makeFloatIndex(10, name='a'), + tm.makeDateIndex(10, name='a'), + tm.makeDateIndex(10, name='a').tz_localize(tz='US/Eastern'), + tm.makePeriodIndex(10, name='a'), + tm.makeStringIndex(10, name='a'), + tm.makeUnicodeIndex(10, name='a') + ] + + arr = np.random.randn(10) + series = [Series(arr, index=idx, name='a') for idx in indexes] + + objs = indexes + series + return objs + + +objs = get_objs() + + +class TestReductions(object): + + @pytest.mark.parametrize('opname', ['max', 'min']) + @pytest.mark.parametrize('obj', objs) + def test_ops(self, opname, obj): + result = getattr(obj, opname)() + if not isinstance(obj, PeriodIndex): + expected = getattr(obj.values, opname)() + else: + expected = pd.Period( + ordinal=getattr(obj._ndarray_values, opname)(), + freq=obj.freq) + try: + assert result == expected + except TypeError: + # comparing tz-aware series with np.array results in + # TypeError + expected = expected.astype('M8[ns]').astype('int64') + assert result.value == expected + + def test_nanops(self): + # GH#7261 + for opname in ['max', 'min']: + for klass in [Index, Series]: + + obj = klass([np.nan, 2.0]) + assert getattr(obj, opname)() == 2.0 + + obj = klass([np.nan]) + assert pd.isna(getattr(obj, opname)()) + + obj = klass([]) + assert 
pd.isna(getattr(obj, opname)()) + + obj = klass([pd.NaT, datetime(2011, 11, 1)]) + # check DatetimeIndex monotonic path + assert getattr(obj, opname)() == datetime(2011, 11, 1) + + obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) + # check DatetimeIndex non-monotonic path + assert getattr(obj, opname)() == datetime(2011, 11, 1) + + # argmin/max + obj = Index(np.arange(5, dtype='int64')) + assert obj.argmin() == 0 + assert obj.argmax() == 4 + + obj = Index([np.nan, 1, np.nan, 2]) + assert obj.argmin() == 1 + assert obj.argmax() == 3 + + obj = Index([np.nan]) + assert obj.argmin() == -1 + assert obj.argmax() == -1 + + obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), + pd.NaT]) + assert obj.argmin() == 1 + assert obj.argmax() == 2 + + obj = Index([pd.NaT]) + assert obj.argmin() == -1 + assert obj.argmax() == -1 + + +class TestSeriesReductions(object): + # Note: the name TestSeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def test_sum_inf(self): + s = Series(np.random.randn(10)) + s2 = s.copy() + + s[5:8] = np.inf + s2[5:8] = np.nan + + assert np.isinf(s.sum()) + + arr = np.random.randn(100, 100).astype('f4') + arr[:, 2] = np.inf + + with pd.option_context("mode.use_inf_as_na", True): + tm.assert_almost_equal(s.sum(), s2.sum()) + + res = nanops.nansum(arr, axis=1) + assert np.isinf(res).all() + + @pytest.mark.parametrize("use_bottleneck", [True, False]) + @pytest.mark.parametrize("method, unit", [ + ("sum", 0.0), + ("prod", 1.0) + ]) + def test_empty(self, method, unit, use_bottleneck): + with pd.option_context("use_bottleneck", use_bottleneck): + # GH#9422 / GH#18921 + # Entirely empty + s = Series([]) + # NA by default + result = getattr(s, method)() + assert result == unit + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) + assert pd.isna(result) + + #
Skipna, default + result = getattr(s, method)(skipna=True) + assert result == unit + + # Skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) + assert pd.isna(result) + + # All-NA + s = Series([np.nan]) + # NA by default + result = getattr(s, method)() + assert result == unit + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) + assert pd.isna(result) + + # Skipna, default + result = getattr(s, method)(skipna=True) + assert result == unit + + # skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) + assert pd.isna(result) + + # Mix of valid, empty + s = Series([np.nan, 1]) + # Default + result = getattr(s, method)() + assert result == 1.0 + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == 1.0 + + result = getattr(s, method)(min_count=1) + assert result == 1.0 + + # Skipna + result = getattr(s, method)(skipna=True) + assert result == 1.0 + + result = getattr(s, method)(skipna=True, min_count=0) + assert result == 1.0 + + result = getattr(s, method)(skipna=True, min_count=1) + assert result == 1.0 + + # GH#844 (changed in GH#9422) + df = DataFrame(np.empty((10, 0))) + assert (getattr(df, method)(1) == unit).all() + + s = pd.Series([1]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan, 1]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + @pytest.mark.parametrize('method, unit', [ + ('sum', 0.0), + ('prod', 1.0), + ]) + def test_empty_multi(self, method, unit): + s = pd.Series([1, np.nan, np.nan, np.nan], + index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) + # 1 / 0 by default + result = getattr(s, method)(level=0) +
expected = pd.Series([1, unit], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = getattr(s, method)(level=0, min_count=0) + expected = pd.Series([1, unit], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = getattr(s, method)(level=0, min_count=1) + expected = pd.Series([1, np.nan], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "method", ['mean', 'median', 'std', 'var']) + def test_ops_consistency_on_empty(self, method): + + # GH#7869 + # consistency on empty + + # float + result = getattr(Series(dtype=float), method)() + assert pd.isna(result) + + # timedelta64[ns] + result = getattr(Series(dtype='m8[ns]'), method)() + assert result is pd.NaT + + def test_nansum_buglet(self): + ser = Series([1.0, np.nan], index=[0, 1]) + result = np.nansum(ser) + tm.assert_almost_equal(result, 1) + + @pytest.mark.parametrize("use_bottleneck", [True, False]) + def test_sum_overflow(self, use_bottleneck): + + with pd.option_context('use_bottleneck', use_bottleneck): + # GH#6915 + # overflowing on the smaller int dtypes + for dtype in ['int32', 'int64']: + v = np.arange(5000000, dtype=dtype) + s = Series(v) + + result = s.sum(skipna=False) + assert int(result) == v.sum(dtype='int64') + result = s.min(skipna=False) + assert int(result) == 0 + result = s.max(skipna=False) + assert int(result) == v[-1] + + for dtype in ['float32', 'float64']: + v = np.arange(5000000, dtype=dtype) + s = Series(v) + + result = s.sum(skipna=False) + assert result == v.sum(dtype=dtype) + result = s.min(skipna=False) + assert np.allclose(float(result), 0.0) + result = s.max(skipna=False) + assert np.allclose(float(result), v[-1]) + + def test_empty_timeseries_reductions_return_nat(self): + # covers GH#11245 + for dtype in ('m8[ns]', 'm8[ns]', 'M8[ns]', 'M8[ns, UTC]'): + assert Series([], dtype=dtype).min() is pd.NaT + assert Series([], dtype=dtype).max() is pd.NaT + + def 
test_numpy_argmin_deprecated(self): + # See GH#16830 + data = np.arange(1, 11) + + s = Series(data, index=data) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # The deprecation of Series.argmin also causes a deprecation + # warning when calling np.argmin. This behavior is temporary + # until the implementation of Series.argmin is corrected. + result = np.argmin(s) + + assert result == 1 + + with tm.assert_produces_warning(FutureWarning): + # argmin is aliased to idxmin + result = s.argmin() + + assert result == 1 + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argmin(s, out=data) + + def test_numpy_argmax_deprecated(self): + # See GH#16830 + data = np.arange(1, 11) + + s = Series(data, index=data) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # The deprecation of Series.argmax also causes a deprecation + # warning when calling np.argmax. This behavior is temporary + # until the implementation of Series.argmax is corrected. + result = np.argmax(s) + assert result == 10 + + with tm.assert_produces_warning(FutureWarning): + # argmax is aliased to idxmax + result = s.argmax() + + assert result == 10 + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argmax(s, out=data) + + def test_idxmin(self): + # test idxmin + # _check_stat_op approach can not be used here because of isna check. 
+ string_series = tm.makeStringSeries().rename('series') + + # add some NaNs + string_series[5:15] = np.NaN + + # skipna or no + assert string_series[string_series.idxmin()] == string_series.min() + assert pd.isna(string_series.idxmin(skipna=False)) + + # no NaNs + nona = string_series.dropna() + assert nona[nona.idxmin()] == nona.min() + assert (nona.index.values.tolist().index(nona.idxmin()) == + nona.values.argmin()) + + # all NaNs + allna = string_series * np.nan + assert pd.isna(allna.idxmin()) + + # datetime64[ns] + s = Series(pd.date_range('20130102', periods=6)) + result = s.idxmin() + assert result == 0 + + s[0] = np.nan + result = s.idxmin() + assert result == 1 + + def test_idxmax(self): + # test idxmax + # _check_stat_op approach can not be used here because of isna check. + string_series = tm.makeStringSeries().rename('series') + + # add some NaNs + string_series[5:15] = np.NaN + + # skipna or no + assert string_series[string_series.idxmax()] == string_series.max() + assert pd.isna(string_series.idxmax(skipna=False)) + + # no NaNs + nona = string_series.dropna() + assert nona[nona.idxmax()] == nona.max() + assert (nona.index.values.tolist().index(nona.idxmax()) == + nona.values.argmax()) + + # all NaNs + allna = string_series * np.nan + assert pd.isna(allna.idxmax()) + + from pandas import date_range + s = Series(date_range('20130102', periods=6)) + result = s.idxmax() + assert result == 5 + + s[5] = np.nan + result = s.idxmax() + assert result == 4 + + # Float64Index + # GH#5914 + s = pd.Series([1, 2, 3], [1.1, 2.1, 3.1]) + result = s.idxmax() + assert result == 3.1 + result = s.idxmin() + assert result == 1.1 + + s = pd.Series(s.index, s.index) + result = s.idxmax() + assert result == 3.1 + result = s.idxmin() + assert result == 1.1 + + def test_all_any(self): + ts = tm.makeTimeSeries() + bool_series = ts > 0 + assert not bool_series.all() + assert bool_series.any() + + # Alternative types, with implicit 'object' dtype. 
+ s = Series(['abc', True]) + assert 'abc' == s.any() # 'abc' || True => 'abc' + + def test_all_any_params(self): + # Check skipna, with implicit 'object' dtype. + s1 = Series([np.nan, True]) + s2 = Series([np.nan, False]) + assert s1.all(skipna=False) # nan && True => True + assert s1.all(skipna=True) + assert np.isnan(s2.any(skipna=False)) # nan || False => nan + assert not s2.any(skipna=True) + + # Check level. + s = pd.Series([False, False, True, True, False, True], + index=[0, 0, 1, 1, 2, 2]) + tm.assert_series_equal(s.all(level=0), Series([False, True, False])) + tm.assert_series_equal(s.any(level=0), Series([False, True, True])) + + # bool_only is not implemented with level option. + with pytest.raises(NotImplementedError): + s.any(bool_only=True, level=0) + with pytest.raises(NotImplementedError): + s.all(bool_only=True, level=0) + + # bool_only is not implemented alone. + with pytest.raises(NotImplementedError): + s.any(bool_only=True,) + with pytest.raises(NotImplementedError): + s.all(bool_only=True) + + def test_timedelta64_analytics(self): + + # index min/max + dti = pd.date_range('2012-1-1', periods=3, freq='D') + td = Series(dti) - pd.Timestamp('20120101') + + result = td.idxmin() + assert result == 0 + + result = td.idxmax() + assert result == 2 + + # GH#2982 + # with NaT + td[0] = np.nan + + result = td.idxmin() + assert result == 1 + + result = td.idxmax() + assert result == 2 + + # abs + s1 = Series(pd.date_range('20120101', periods=3)) + s2 = Series(pd.date_range('20120102', periods=3)) + expected = Series(s2 - s1) + + # FIXME: don't leave commented-out code + # this fails as numpy returns timedelta64[us] + # result = np.abs(s1-s2) + # assert_frame_equal(result,expected) + + result = (s1 - s2).abs() + tm.assert_series_equal(result, expected) + + # max/min + result = td.max() + expected = pd.Timedelta('2 days') + assert result == expected + + result = td.min() + expected = pd.Timedelta('1 days') + assert result == expected + + 
@pytest.mark.parametrize( + "test_input,error_type", + [ + (pd.Series([]), ValueError), + + # For strings, or any Series with dtype 'O' + (pd.Series(['foo', 'bar', 'baz']), TypeError), + (pd.Series([(1,), (2,)]), TypeError), + + # For mixed data types + ( + pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz']), + TypeError + ), + ] + ) + def test_assert_idxminmax_raises(self, test_input, error_type): + """ + Cases where ``Series.argmax`` and related should raise an exception + """ + with pytest.raises(error_type): + test_input.idxmin() + with pytest.raises(error_type): + test_input.idxmin(skipna=False) + with pytest.raises(error_type): + test_input.idxmax() + with pytest.raises(error_type): + test_input.idxmax(skipna=False) + + def test_idxminmax_with_inf(self): + # For numeric data with NA and Inf (GH #13595) + s = pd.Series([0, -np.inf, np.inf, np.nan]) + + assert s.idxmin() == 1 + assert np.isnan(s.idxmin(skipna=False)) + + assert s.idxmax() == 2 + assert np.isnan(s.idxmax(skipna=False)) + + # Using old-style behavior that treats floating point nan, -inf, and + # +inf as missing + with pd.option_context('mode.use_inf_as_na', True): + assert s.idxmin() == 0 + assert np.isnan(s.idxmin(skipna=False)) + assert s.idxmax() == 0 + assert np.isnan(s.idxmax(skipna=False)) + + +class TestDatetime64SeriesReductions(object): + # Note: the name TestDatetime64SeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + @pytest.mark.parametrize('nat_ser', [ + Series([pd.NaT, pd.NaT]), + Series([pd.NaT, pd.Timedelta('nat')]), + Series([pd.Timedelta('nat'), pd.Timedelta('nat')])]) + def test_minmax_nat_series(self, nat_ser): + # GH#23282 + assert nat_ser.min() is pd.NaT + assert nat_ser.max() is pd.NaT + + @pytest.mark.parametrize('nat_df', [ + pd.DataFrame([pd.NaT, pd.NaT]), + pd.DataFrame([pd.NaT, pd.Timedelta('nat')]), + pd.DataFrame([pd.Timedelta('nat'),
pd.Timedelta('nat')])]) + def test_minmax_nat_dataframe(self, nat_df): + # GH#23282 + assert nat_df.min()[0] is pd.NaT + assert nat_df.max()[0] is pd.NaT + + def test_min_max(self): + rng = pd.date_range('1/1/2000', '12/31/2000') + rng2 = rng.take(np.random.permutation(len(rng))) + + the_min = rng2.min() + the_max = rng2.max() + assert isinstance(the_min, pd.Timestamp) + assert isinstance(the_max, pd.Timestamp) + assert the_min == rng[0] + assert the_max == rng[-1] + + assert rng.min() == rng[0] + assert rng.max() == rng[-1] + + def test_min_max_series(self): + rng = pd.date_range('1/1/2000', periods=10, freq='4h') + lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] + df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls}) + + result = df.TS.max() + exp = pd.Timestamp(df.TS.iat[-1]) + assert isinstance(result, pd.Timestamp) + assert result == exp + + result = df.TS.min() + exp = pd.Timestamp(df.TS.iat[0]) + assert isinstance(result, pd.Timestamp) + assert result == exp + + +class TestCategoricalSeriesReductions(object): + # Note: the name TestCategoricalSeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def test_min_max(self): + # unordered cats have no min/max + cat = Series(Categorical(["a", "b", "c", "d"], ordered=False)) + with pytest.raises(TypeError): + cat.min() + with pytest.raises(TypeError): + cat.max() + + cat = Series(Categorical(["a", "b", "c", "d"], ordered=True)) + _min = cat.min() + _max = cat.max() + assert _min == "a" + assert _max == "d" + + cat = Series(Categorical(["a", "b", "c", "d"], categories=[ + 'd', 'c', 'b', 'a'], ordered=True)) + _min = cat.min() + _max = cat.max() + assert _min == "d" + assert _max == "a" + + cat = Series(Categorical( + [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a' + ], ordered=True)) + _min = cat.min() + _max = cat.max() + assert np.isnan(_min) + assert _max == "b" + + 
cat = Series(Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True)) + _min = cat.min() + _max = cat.max() + assert np.isnan(_min) + assert _max == 1 + + +class TestSeriesMode(object): + # Note: the name TestSeriesMode indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + @pytest.mark.parametrize('dropna, expected', [ + (True, Series([], dtype=np.float64)), + (False, Series([], dtype=np.float64)) + ]) + def test_mode_empty(self, dropna, expected): + s = Series([], dtype=np.float64) + result = s.mode(dropna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, data, expected', [ + (True, [1, 1, 1, 2], [1]), + (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + (False, [1, 1, 1, 2], [1]), + (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + ]) + @pytest.mark.parametrize( + 'dt', + list(np.typecodes['AllInteger'] + np.typecodes['Float']) + ) + def test_mode_numerical(self, dropna, data, expected, dt): + s = Series(data, dtype=dt) + result = s.mode(dropna) + expected = Series(expected, dtype=dt) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected', [ + (True, [1.0]), + (False, [1, np.nan]), + ]) + def test_mode_numerical_nan(self, dropna, expected): + s = Series([1, 1, 2, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, ['b'], ['bar'], ['nan']), + (False, ['b'], [np.nan], ['nan']) + ]) + def test_mode_str_obj(self, dropna, expected1, expected2, expected3): + # Test string and object types. 
+ data = ['a'] * 2 + ['b'] * 3 + + s = Series(data, dtype='c') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='c') + tm.assert_series_equal(result, expected1) + + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + + s = Series(data, dtype=object) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected2) + + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + + s = Series(data, dtype=object).astype(str) + result = s.mode(dropna) + expected3 = Series(expected3, dtype=str) + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['foo'], ['foo']), + (False, ['foo'], [np.nan]) + ]) + def test_mode_mixeddtype(self, dropna, expected1, expected2): + s = Series([1, 'foo', 'foo']) + result = s.mode(dropna) + expected = Series(expected1) + tm.assert_series_equal(result, expected) + + s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['1900-05-03', '2011-01-03', '2013-01-02'], + ['2011-01-03', '2013-01-02']), + (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), + ]) + def test_mode_datetime(self, dropna, expected1, expected2): + s = Series(['2011-01-03', '2013-01-02', + '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='M8[ns]') + tm.assert_series_equal(result, expected1) + + s = Series(['2011-01-03', '2013-01-02', '1900-05-03', + '2011-01-03', '2013-01-02', 'nan', 'nan'], + dtype='M8[ns]') + result = s.mode(dropna) + expected2 = Series(expected2, dtype='M8[ns]') + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), + (False, [np.nan], [np.nan, '2 min', '1 day']), + ]) + 
def test_mode_timedelta(self, dropna, expected1, expected2): + # gh-5986: Test timedelta types. + + s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], + dtype='timedelta64[ns]') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='timedelta64[ns]') + tm.assert_series_equal(result, expected1) + + s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', + '2 min', '2 min', 'nan', 'nan'], + dtype='timedelta64[ns]') + result = s.mode(dropna) + expected2 = Series(expected2, dtype='timedelta64[ns]') + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, Categorical([1, 2], categories=[1, 2]), + Categorical(['a'], categories=[1, 'a']), + Categorical([3, 1], categories=[3, 2, 1], ordered=True)), + (False, Categorical([np.nan], categories=[1, 2]), + Categorical([np.nan, 'a'], categories=[1, 'a']), + Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), + ]) + def test_mode_category(self, dropna, expected1, expected2, expected3): + s = Series(Categorical([1, 2, np.nan, np.nan])) + result = s.mode(dropna) + expected1 = Series(expected1, dtype='category') + tm.assert_series_equal(result, expected1) + + s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) + result = s.mode(dropna) + expected2 = Series(expected2, dtype='category') + tm.assert_series_equal(result, expected2) + + s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], + categories=[3, 2, 1], ordered=True)) + result = s.mode(dropna) + expected3 = Series(expected3, dtype='category') + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, [2**63], [1, 2**63]), + (False, [2**63], [1, 2**63]) + ]) + def test_mode_intoverflow(self, dropna, expected1, expected2): + # Test for uint64 overflow. 
+ s = Series([1, 2**63, 2**63], dtype=np.uint64) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=np.uint64) + tm.assert_series_equal(result, expected1) + + s = Series([1, 2**63], dtype=np.uint64) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=np.uint64) + tm.assert_series_equal(result, expected2) + + @pytest.mark.skipif(not compat.PY3, reason="only PY3") + def test_mode_sortwarning(self): + # Check for the warning that is raised when the mode + # results cannot be sorted + + expected = Series(['foo', np.nan]) + s = Series([1, 'foo', 'foo', np.nan, np.nan]) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = s.mode(dropna=False) + result = result.sort_values().reset_index(drop=True) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py new file mode 100644 index 0000000000000..1146e0793d4f5 --- /dev/null +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- +""" +Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ... 
+""" + +import numpy as np +import pytest + +from pandas.compat import lrange +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series, compat +import pandas.util.testing as tm + + +class TestSeriesStatReductions(object): + # Note: the name TestSeriesStatReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def _check_stat_op(self, name, alternate, string_series_, + check_objects=False, check_allna=False): + + with pd.option_context('use_bottleneck', False): + f = getattr(Series, name) + + # add some NaNs + string_series_[5:15] = np.NaN + + # idxmax, idxmin, min, and max are valid for dates + if name not in ['max', 'min']: + ds = Series(pd.date_range('1/1/2001', periods=10)) + with pytest.raises(TypeError): + f(ds) + + # skipna or no + assert pd.notna(f(string_series_)) + assert pd.isna(f(string_series_, skipna=False)) + + # check the result is correct + nona = string_series_.dropna() + tm.assert_almost_equal(f(nona), alternate(nona.values)) + tm.assert_almost_equal(f(string_series_), alternate(nona.values)) + + allna = string_series_ * np.nan + + if check_allna: + assert np.isnan(f(allna)) + + # dtype=object with None, it works! + s = Series([1, 2, 3, None, 5]) + f(s) + + # GH#2888 + items = [0] + items.extend(lrange(2 ** 40, 2 ** 40 + 1000)) + s = Series(items, dtype='int64') + tm.assert_almost_equal(float(f(s)), float(alternate(s.values))) + + # check date range + if check_objects: + s = Series(pd.bdate_range('1/1/2000', periods=10)) + res = f(s) + exp = alternate(s) + assert res == exp + + # check on string data + if name not in ['sum', 'min', 'max']: + with pytest.raises(TypeError): + f(Series(list('abc'))) + + # Invalid axis. + with pytest.raises(ValueError): + f(string_series_, axis=1) + + # Unimplemented numeric_only parameter. 
+ if 'numeric_only' in compat.signature(f).args: + with pytest.raises(NotImplementedError, match=name): + f(string_series_, numeric_only=True) + + def test_sum(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('sum', np.sum, string_series, check_allna=False) + + def test_mean(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('mean', np.mean, string_series) + + def test_median(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('median', np.median, string_series) + + # test with integers, test failure + int_ts = Series(np.ones(10, dtype=int), index=lrange(10)) + tm.assert_almost_equal(np.median(int_ts), int_ts.median()) + + def test_prod(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('prod', np.prod, string_series) + + def test_min(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('min', np.min, string_series, check_objects=True) + + def test_max(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('max', np.max, string_series, check_objects=True) + + def test_var_std(self): + string_series = tm.makeStringSeries().rename('series') + datetime_series = tm.makeTimeSeries().rename('ts') + + alt = lambda x: np.std(x, ddof=1) + self._check_stat_op('std', alt, string_series) + + alt = lambda x: np.var(x, ddof=1) + self._check_stat_op('var', alt, string_series) + + result = datetime_series.std(ddof=4) + expected = np.std(datetime_series.values, ddof=4) + tm.assert_almost_equal(result, expected) + + result = datetime_series.var(ddof=4) + expected = np.var(datetime_series.values, ddof=4) + tm.assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = datetime_series.iloc[[0]] + result = s.var(ddof=1) + assert pd.isna(result) + + result = s.std(ddof=1) + assert pd.isna(result) + + def test_sem(self): + string_series = 
tm.makeStringSeries().rename('series') + datetime_series = tm.makeTimeSeries().rename('ts') + + alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) + self._check_stat_op('sem', alt, string_series) + + result = datetime_series.sem(ddof=4) + expected = np.std(datetime_series.values, + ddof=4) / np.sqrt(len(datetime_series.values)) + tm.assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = datetime_series.iloc[[0]] + result = s.sem(ddof=1) + assert pd.isna(result) + + @td.skip_if_no_scipy + def test_skew(self): + from scipy.stats import skew + + string_series = tm.makeStringSeries().rename('series') + + alt = lambda x: skew(x, bias=False) + self._check_stat_op('skew', alt, string_series) + + # test corner cases, skew() returns NaN unless there's at least 3 + # values + min_N = 3 + for i in range(1, min_N + 1): + s = Series(np.ones(i)) + df = DataFrame(np.ones((i, i))) + if i < min_N: + assert np.isnan(s.skew()) + assert np.isnan(df.skew()).all() + else: + assert 0 == s.skew() + assert (df.skew() == 0).all() + + @td.skip_if_no_scipy + def test_kurt(self): + from scipy.stats import kurtosis + + string_series = tm.makeStringSeries().rename('series') + + alt = lambda x: kurtosis(x, bias=False) + self._check_stat_op('kurt', alt, string_series) + + index = pd.MultiIndex( + levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]] + ) + s = Series(np.random.randn(6), index=index) + tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar']) + + # test corner cases, kurt() returns NaN unless there's at least 4 + # values + min_N = 4 + for i in range(1, min_N + 1): + s = Series(np.ones(i)) + df = DataFrame(np.ones((i, i))) + if i < min_N: + assert np.isnan(s.kurt()) + assert np.isnan(df.kurt()).all() + else: + assert 0 == s.kurt() + assert (df.kurt() == 0).all() diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 73c6ea67ee8aa..b5140a5319c01 100644 
--- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -14,13 +14,11 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, Series, bdate_range, compat, - date_range, isna, notna) + Categorical, CategoricalIndex, DataFrame, Series, compat, date_range, isna, + notna) from pandas.api.types import is_scalar from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp -from pandas.core.indexes.timedeltas import Timedelta -import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_index_equal, @@ -29,292 +27,6 @@ class TestSeriesAnalytics(object): - @pytest.mark.parametrize("use_bottleneck", [True, False]) - @pytest.mark.parametrize("method, unit", [ - ("sum", 0.0), - ("prod", 1.0) - ]) - def test_empty(self, method, unit, use_bottleneck): - with pd.option_context("use_bottleneck", use_bottleneck): - # GH 9422 / 18921 - # Entirely empty - s = Series([]) - # NA by default - result = getattr(s, method)() - assert result == unit - - # Explicit - result = getattr(s, method)(min_count=0) - assert result == unit - - result = getattr(s, method)(min_count=1) - assert isna(result) - - # Skipna, default - result = getattr(s, method)(skipna=True) - result == unit - - # Skipna, explicit - result = getattr(s, method)(skipna=True, min_count=0) - assert result == unit - - result = getattr(s, method)(skipna=True, min_count=1) - assert isna(result) - - # All-NA - s = Series([np.nan]) - # NA by default - result = getattr(s, method)() - assert result == unit - - # Explicit - result = getattr(s, method)(min_count=0) - assert result == unit - - result = getattr(s, method)(min_count=1) - assert isna(result) - - # Skipna, default - result = getattr(s, method)(skipna=True) - result == unit - - # skipna, explicit - result = getattr(s, method)(skipna=True, min_count=0) - assert result == unit - - result = 
getattr(s, method)(skipna=True, min_count=1) - assert isna(result) - - # Mix of valid, empty - s = Series([np.nan, 1]) - # Default - result = getattr(s, method)() - assert result == 1.0 - - # Explicit - result = getattr(s, method)(min_count=0) - assert result == 1.0 - - result = getattr(s, method)(min_count=1) - assert result == 1.0 - - # Skipna - result = getattr(s, method)(skipna=True) - assert result == 1.0 - - result = getattr(s, method)(skipna=True, min_count=0) - assert result == 1.0 - - result = getattr(s, method)(skipna=True, min_count=1) - assert result == 1.0 - - # GH #844 (changed in 9422) - df = DataFrame(np.empty((10, 0))) - assert (getattr(df, method)(1) == unit).all() - - s = pd.Series([1]) - result = getattr(s, method)(min_count=2) - assert isna(result) - - s = pd.Series([np.nan]) - result = getattr(s, method)(min_count=2) - assert isna(result) - - s = pd.Series([np.nan, 1]) - result = getattr(s, method)(min_count=2) - assert isna(result) - - @pytest.mark.parametrize('method, unit', [ - ('sum', 0.0), - ('prod', 1.0), - ]) - def test_empty_multi(self, method, unit): - s = pd.Series([1, np.nan, np.nan, np.nan], - index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) - # 1 / 0 by default - result = getattr(s, method)(level=0) - expected = pd.Series([1, unit], index=['a', 'b']) - tm.assert_series_equal(result, expected) - - # min_count=0 - result = getattr(s, method)(level=0, min_count=0) - expected = pd.Series([1, unit], index=['a', 'b']) - tm.assert_series_equal(result, expected) - - # min_count=1 - result = getattr(s, method)(level=0, min_count=1) - expected = pd.Series([1, np.nan], index=['a', 'b']) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "method", ['mean', 'median', 'std', 'var']) - def test_ops_consistency_on_empty(self, method): - - # GH 7869 - # consistency on empty - - # float - result = getattr(Series(dtype=float), method)() - assert isna(result) - - # timedelta64[ns] - result = 
getattr(Series(dtype='m8[ns]'), method)() - assert result is pd.NaT - - def test_nansum_buglet(self): - s = Series([1.0, np.nan], index=[0, 1]) - result = np.nansum(s) - assert_almost_equal(result, 1) - - @pytest.mark.parametrize("use_bottleneck", [True, False]) - def test_sum_overflow(self, use_bottleneck): - - with pd.option_context('use_bottleneck', use_bottleneck): - # GH 6915 - # overflowing on the smaller int dtypes - for dtype in ['int32', 'int64']: - v = np.arange(5000000, dtype=dtype) - s = Series(v) - - result = s.sum(skipna=False) - assert int(result) == v.sum(dtype='int64') - result = s.min(skipna=False) - assert int(result) == 0 - result = s.max(skipna=False) - assert int(result) == v[-1] - - for dtype in ['float32', 'float64']: - v = np.arange(5000000, dtype=dtype) - s = Series(v) - - result = s.sum(skipna=False) - assert result == v.sum(dtype=dtype) - result = s.min(skipna=False) - assert np.allclose(float(result), 0.0) - result = s.max(skipna=False) - assert np.allclose(float(result), v[-1]) - - def test_sum(self, string_series): - self._check_stat_op('sum', np.sum, string_series, check_allna=False) - - def test_sum_inf(self): - s = Series(np.random.randn(10)) - s2 = s.copy() - - s[5:8] = np.inf - s2[5:8] = np.nan - - assert np.isinf(s.sum()) - - arr = np.random.randn(100, 100).astype('f4') - arr[:, 2] = np.inf - - with pd.option_context("mode.use_inf_as_na", True): - assert_almost_equal(s.sum(), s2.sum()) - - res = nanops.nansum(arr, axis=1) - assert np.isinf(res).all() - - def test_mean(self, string_series): - self._check_stat_op('mean', np.mean, string_series) - - def test_median(self, string_series): - self._check_stat_op('median', np.median, string_series) - - # test with integers, test failure - int_ts = Series(np.ones(10, dtype=int), index=lrange(10)) - tm.assert_almost_equal(np.median(int_ts), int_ts.median()) - - def test_prod(self, string_series): - self._check_stat_op('prod', np.prod, string_series) - - def test_min(self, string_series): 
- self._check_stat_op('min', np.min, string_series, check_objects=True) - - def test_max(self, string_series): - self._check_stat_op('max', np.max, string_series, check_objects=True) - - def test_var_std(self, datetime_series, string_series): - alt = lambda x: np.std(x, ddof=1) - self._check_stat_op('std', alt, string_series) - - alt = lambda x: np.var(x, ddof=1) - self._check_stat_op('var', alt, string_series) - - result = datetime_series.std(ddof=4) - expected = np.std(datetime_series.values, ddof=4) - assert_almost_equal(result, expected) - - result = datetime_series.var(ddof=4) - expected = np.var(datetime_series.values, ddof=4) - assert_almost_equal(result, expected) - - # 1 - element series with ddof=1 - s = datetime_series.iloc[[0]] - result = s.var(ddof=1) - assert isna(result) - - result = s.std(ddof=1) - assert isna(result) - - def test_sem(self, datetime_series, string_series): - alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) - self._check_stat_op('sem', alt, string_series) - - result = datetime_series.sem(ddof=4) - expected = np.std(datetime_series.values, - ddof=4) / np.sqrt(len(datetime_series.values)) - assert_almost_equal(result, expected) - - # 1 - element series with ddof=1 - s = datetime_series.iloc[[0]] - result = s.sem(ddof=1) - assert isna(result) - - @td.skip_if_no_scipy - def test_skew(self, string_series): - from scipy.stats import skew - alt = lambda x: skew(x, bias=False) - self._check_stat_op('skew', alt, string_series) - - # test corner cases, skew() returns NaN unless there's at least 3 - # values - min_N = 3 - for i in range(1, min_N + 1): - s = Series(np.ones(i)) - df = DataFrame(np.ones((i, i))) - if i < min_N: - assert np.isnan(s.skew()) - assert np.isnan(df.skew()).all() - else: - assert 0 == s.skew() - assert (df.skew() == 0).all() - - @td.skip_if_no_scipy - def test_kurt(self, string_series): - from scipy.stats import kurtosis - alt = lambda x: kurtosis(x, bias=False) - self._check_stat_op('kurt', alt, string_series) - - 
index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) - s = Series(np.random.randn(6), index=index) - tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar']) - - # test corner cases, kurt() returns NaN unless there's at least 4 - # values - min_N = 4 - for i in range(1, min_N + 1): - s = Series(np.ones(i)) - df = DataFrame(np.ones((i, i))) - if i < min_N: - assert np.isnan(s.kurt()) - assert np.isnan(df.kurt()).all() - else: - assert 0 == s.kurt() - assert (df.kurt() == 0).all() - def test_describe(self): s = Series([0, 1, 2, 3, 4], name='int_data') result = s.describe() @@ -508,63 +220,6 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) - def _check_stat_op(self, name, alternate, string_series_, - check_objects=False, check_allna=False): - - with pd.option_context('use_bottleneck', False): - f = getattr(Series, name) - - # add some NaNs - string_series_[5:15] = np.NaN - - # idxmax, idxmin, min, and max are valid for dates - if name not in ['max', 'min']: - ds = Series(date_range('1/1/2001', periods=10)) - pytest.raises(TypeError, f, ds) - - # skipna or no - assert notna(f(string_series_)) - assert isna(f(string_series_, skipna=False)) - - # check the result is correct - nona = string_series_.dropna() - assert_almost_equal(f(nona), alternate(nona.values)) - assert_almost_equal(f(string_series_), alternate(nona.values)) - - allna = string_series_ * nan - - if check_allna: - assert np.isnan(f(allna)) - - # dtype=object with None, it works! 
- s = Series([1, 2, 3, None, 5]) - f(s) - - # 2888 - items = [0] - items.extend(lrange(2 ** 40, 2 ** 40 + 1000)) - s = Series(items, dtype='int64') - assert_almost_equal(float(f(s)), float(alternate(s.values))) - - # check date range - if check_objects: - s = Series(bdate_range('1/1/2000', periods=10)) - res = f(s) - exp = alternate(s) - assert res == exp - - # check on string data - if name not in ['sum', 'min', 'max']: - pytest.raises(TypeError, f, Series(list('abc'))) - - # Invalid axis. - pytest.raises(ValueError, f, string_series_, axis=1) - - # Unimplemented numeric_only parameter. - if 'numeric_only' in compat.signature(f).args: - with pytest.raises(NotImplementedError, match=name): - f(string_series_, numeric_only=True) - def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal(func(datetime_series_).values, @@ -649,39 +304,6 @@ def test_prod_numpy16_bug(self): assert not isinstance(result, Series) - def test_all_any(self): - ts = tm.makeTimeSeries() - bool_series = ts > 0 - assert not bool_series.all() - assert bool_series.any() - - # Alternative types, with implicit 'object' dtype. - s = Series(['abc', True]) - assert 'abc' == s.any() # 'abc' || True => 'abc' - - def test_all_any_params(self): - # Check skipna, with implicit 'object' dtype. - s1 = Series([np.nan, True]) - s2 = Series([np.nan, False]) - assert s1.all(skipna=False) # nan && True => True - assert s1.all(skipna=True) - assert np.isnan(s2.any(skipna=False)) # nan || False => nan - assert not s2.any(skipna=True) - - # Check level. - s = pd.Series([False, False, True, True, False, True], - index=[0, 0, 1, 1, 2, 2]) - assert_series_equal(s.all(level=0), Series([False, True, False])) - assert_series_equal(s.any(level=0), Series([False, True, True])) - - # bool_only is not implemented with level option. 
- pytest.raises(NotImplementedError, s.any, bool_only=True, level=0) - pytest.raises(NotImplementedError, s.all, bool_only=True, level=0) - - # bool_only is not implemented alone. - pytest.raises(NotImplementedError, s.any, bool_only=True) - pytest.raises(NotImplementedError, s.all, bool_only=True) - @td.skip_if_no_scipy def test_corr(self, datetime_series): import scipy.stats as stats @@ -1124,174 +746,6 @@ def test_isin_empty(self, empty): result = s.isin(empty) tm.assert_series_equal(expected, result) - def test_timedelta64_analytics(self): - from pandas import date_range - - # index min/max - td = Series(date_range('2012-1-1', periods=3, freq='D')) - \ - Timestamp('20120101') - - result = td.idxmin() - assert result == 0 - - result = td.idxmax() - assert result == 2 - - # GH 2982 - # with NaT - td[0] = np.nan - - result = td.idxmin() - assert result == 1 - - result = td.idxmax() - assert result == 2 - - # abs - s1 = Series(date_range('20120101', periods=3)) - s2 = Series(date_range('20120102', periods=3)) - expected = Series(s2 - s1) - - # this fails as numpy returns timedelta64[us] - # result = np.abs(s1-s2) - # assert_frame_equal(result,expected) - - result = (s1 - s2).abs() - assert_series_equal(result, expected) - - # max/min - result = td.max() - expected = Timedelta('2 days') - assert result == expected - - result = td.min() - expected = Timedelta('1 days') - assert result == expected - - def test_idxmin(self, string_series): - # test idxmin - # _check_stat_op approach can not be used here because of isna check. 
- - # add some NaNs - string_series[5:15] = np.NaN - - # skipna or no - assert string_series[string_series.idxmin()] == string_series.min() - assert isna(string_series.idxmin(skipna=False)) - - # no NaNs - nona = string_series.dropna() - assert nona[nona.idxmin()] == nona.min() - assert (nona.index.values.tolist().index(nona.idxmin()) == - nona.values.argmin()) - - # all NaNs - allna = string_series * nan - assert isna(allna.idxmin()) - - # datetime64[ns] - from pandas import date_range - s = Series(date_range('20130102', periods=6)) - result = s.idxmin() - assert result == 0 - - s[0] = np.nan - result = s.idxmin() - assert result == 1 - - def test_numpy_argmin_deprecated(self): - # See gh-16830 - data = np.arange(1, 11) - - s = Series(data, index=data) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # The deprecation of Series.argmin also causes a deprecation - # warning when calling np.argmin. This behavior is temporary - # until the implementation of Series.argmin is corrected. - result = np.argmin(s) - - assert result == 1 - - with tm.assert_produces_warning(FutureWarning): - # argmin is aliased to idxmin - result = s.argmin() - - assert result == 1 - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argmin(s, out=data) - - def test_idxmax(self, string_series): - # test idxmax - # _check_stat_op approach can not be used here because of isna check. 
- - # add some NaNs - string_series[5:15] = np.NaN - - # skipna or no - assert string_series[string_series.idxmax()] == string_series.max() - assert isna(string_series.idxmax(skipna=False)) - - # no NaNs - nona = string_series.dropna() - assert nona[nona.idxmax()] == nona.max() - assert (nona.index.values.tolist().index(nona.idxmax()) == - nona.values.argmax()) - - # all NaNs - allna = string_series * nan - assert isna(allna.idxmax()) - - from pandas import date_range - s = Series(date_range('20130102', periods=6)) - result = s.idxmax() - assert result == 5 - - s[5] = np.nan - result = s.idxmax() - assert result == 4 - - # Float64Index - # GH 5914 - s = pd.Series([1, 2, 3], [1.1, 2.1, 3.1]) - result = s.idxmax() - assert result == 3.1 - result = s.idxmin() - assert result == 1.1 - - s = pd.Series(s.index, s.index) - result = s.idxmax() - assert result == 3.1 - result = s.idxmin() - assert result == 1.1 - - def test_numpy_argmax_deprecated(self): - # See gh-16830 - data = np.arange(1, 11) - - s = Series(data, index=data) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # The deprecation of Series.argmax also causes a deprecation - # warning when calling np.argmax. This behavior is temporary - # until the implementation of Series.argmax is corrected. 
- result = np.argmax(s) - assert result == 10 - - with tm.assert_produces_warning(FutureWarning): - # argmax is aliased to idxmax - result = s.argmax() - - assert result == 10 - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argmax(s, out=data) - def test_ptp(self): # GH21614 N = 1000 @@ -1333,12 +787,6 @@ def test_ptp(self): check_stacklevel=False): s.ptp(numeric_only=True) - def test_empty_timeseries_redections_return_nat(self): - # covers #11245 - for dtype in ('m8[ns]', 'm8[ns]', 'M8[ns]', 'M8[ns, UTC]'): - assert Series([], dtype=dtype).min() is pd.NaT - assert Series([], dtype=dtype).max() is pd.NaT - def test_repeat(self): s = Series(np.random.randn(3), index=['a', 'b', 'c']) @@ -1735,180 +1183,6 @@ def s_main_dtypes_split(request, s_main_dtypes): return s_main_dtypes[request.param] -class TestMode(object): - - @pytest.mark.parametrize('dropna, expected', [ - (True, Series([], dtype=np.float64)), - (False, Series([], dtype=np.float64)) - ]) - def test_mode_empty(self, dropna, expected): - s = Series([], dtype=np.float64) - result = s.mode(dropna) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, data, expected', [ - (True, [1, 1, 1, 2], [1]), - (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - (False, [1, 1, 1, 2], [1]), - (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - ]) - @pytest.mark.parametrize( - 'dt', - list(np.typecodes['AllInteger'] + np.typecodes['Float']) - ) - def test_mode_numerical(self, dropna, data, expected, dt): - s = Series(data, dtype=dt) - result = s.mode(dropna) - expected = Series(expected, dtype=dt) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected', [ - (True, [1.0]), - (False, [1, np.nan]), - ]) - def test_mode_numerical_nan(self, dropna, expected): - s = Series([1, 1, 2, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected) - 
tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, ['b'], ['bar'], ['nan']), - (False, ['b'], [np.nan], ['nan']) - ]) - def test_mode_str_obj(self, dropna, expected1, expected2, expected3): - # Test string and object types. - data = ['a'] * 2 + ['b'] * 3 - - s = Series(data, dtype='c') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='c') - tm.assert_series_equal(result, expected1) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected2) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object).astype(str) - result = s.mode(dropna) - expected3 = Series(expected3, dtype=str) - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['foo'], ['foo']), - (False, ['foo'], [np.nan]) - ]) - def test_mode_mixeddtype(self, dropna, expected1, expected2): - s = Series([1, 'foo', 'foo']) - result = s.mode(dropna) - expected = Series(expected1) - tm.assert_series_equal(result, expected) - - s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['1900-05-03', '2011-01-03', '2013-01-02'], - ['2011-01-03', '2013-01-02']), - (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), - ]) - def test_mode_datetime(self, dropna, expected1, expected2): - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='M8[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02', 'nan', 'nan'], - dtype='M8[ns]') - 
result = s.mode(dropna) - expected2 = Series(expected2, dtype='M8[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), - (False, [np.nan], [np.nan, '2 min', '1 day']), - ]) - def test_mode_timedelta(self, dropna, expected1, expected2): - # gh-5986: Test timedelta types. - - s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, Categorical([1, 2], categories=[1, 2]), - Categorical(['a'], categories=[1, 'a']), - Categorical([3, 1], categories=[3, 2, 1], ordered=True)), - (False, Categorical([np.nan], categories=[1, 2]), - Categorical([np.nan, 'a'], categories=[1, 'a']), - Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), - ]) - def test_mode_category(self, dropna, expected1, expected2, expected3): - s = Series(Categorical([1, 2, np.nan, np.nan])) - result = s.mode(dropna) - expected1 = Series(expected1, dtype='category') - tm.assert_series_equal(result, expected1) - - s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) - result = s.mode(dropna) - expected2 = Series(expected2, dtype='category') - tm.assert_series_equal(result, expected2) - - s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], - categories=[3, 2, 1], ordered=True)) - result = s.mode(dropna) - expected3 = Series(expected3, dtype='category') - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, [2**63], [1, 2**63]), - 
(False, [2**63], [1, 2**63]) - ]) - def test_mode_intoverflow(self, dropna, expected1, expected2): - # Test for uint64 overflow. - s = Series([1, 2**63, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected1 = Series(expected1, dtype=np.uint64) - tm.assert_series_equal(result, expected1) - - s = Series([1, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=np.uint64) - tm.assert_series_equal(result, expected2) - - @pytest.mark.skipif(not compat.PY3, reason="only PY3") - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(['foo', np.nan]) - s = Series([1, 'foo', 'foo', np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - - tm.assert_series_equal(result, expected) - - def assert_check_nselect_boundary(vals, dtype, method): # helper function for 'test_boundary_{dtype}' tests s = Series(vals, dtype=dtype) @@ -2047,40 +1321,6 @@ def test_count(self): result = s.count() assert result == 2 - def test_min_max(self): - # unordered cats have no min/max - cat = Series(Categorical(["a", "b", "c", "d"], ordered=False)) - pytest.raises(TypeError, lambda: cat.min()) - pytest.raises(TypeError, lambda: cat.max()) - - cat = Series(Categorical(["a", "b", "c", "d"], ordered=True)) - _min = cat.min() - _max = cat.max() - assert _min == "a" - assert _max == "d" - - cat = Series(Categorical(["a", "b", "c", "d"], categories=[ - 'd', 'c', 'b', 'a'], ordered=True)) - _min = cat.min() - _max = cat.max() - assert _min == "d" - assert _max == "a" - - cat = Series(Categorical( - [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a' - ], ordered=True)) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == "b" - - cat = Series(Categorical( - [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True)) - 
_min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == 1 - def test_value_counts(self): # GH 12835 cats = Categorical(list('abcccb'), categories=list('cabd')) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 0d617d5a26706..745a9eee6c300 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -531,24 +531,6 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): result = s.dt.timetz tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('nat', [ - pd.Series([pd.NaT, pd.NaT]), - pd.Series([pd.NaT, pd.Timedelta('nat')]), - pd.Series([pd.Timedelta('nat'), pd.Timedelta('nat')])]) - def test_minmax_nat_series(self, nat): - # GH 23282 - assert nat.min() is pd.NaT - assert nat.max() is pd.NaT - - @pytest.mark.parametrize('nat', [ - # GH 23282 - pd.DataFrame([pd.NaT, pd.NaT]), - pd.DataFrame([pd.NaT, pd.Timedelta('nat')]), - pd.DataFrame([pd.Timedelta('nat'), pd.Timedelta('nat')])]) - def test_minmax_nat_dataframe(self, nat): - assert nat.min()[0] is pd.NaT - assert nat.max()[0] is pd.NaT - def test_setitem_with_string_index(self): # GH 23451 x = pd.Series([1, 2, 3], index=['Date', 'b', 'other']) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 7c11880ae5f94..f6fb5f0c46cc8 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -750,53 +750,6 @@ def test_op_duplicate_index(self): expected = pd.Series([11, 12, np.nan], index=[1, 1, 2]) assert_series_equal(result, expected) - @pytest.mark.parametrize( - "test_input,error_type", - [ - (pd.Series([]), ValueError), - - # For strings, or any Series with dtype 'O' - (pd.Series(['foo', 'bar', 'baz']), TypeError), - (pd.Series([(1,), (2,)]), TypeError), - - # For mixed data types - ( - pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz']), - TypeError - ), - ] - ) - def 
test_assert_idxminmax_raises(self, test_input, error_type): - """ - Cases where ``Series.argmax`` and related should raise an exception - """ - with pytest.raises(error_type): - test_input.idxmin() - with pytest.raises(error_type): - test_input.idxmin(skipna=False) - with pytest.raises(error_type): - test_input.idxmax() - with pytest.raises(error_type): - test_input.idxmax(skipna=False) - - def test_idxminmax_with_inf(self): - # For numeric data with NA and Inf (GH #13595) - s = pd.Series([0, -np.inf, np.inf, np.nan]) - - assert s.idxmin() == 1 - assert np.isnan(s.idxmin(skipna=False)) - - assert s.idxmax() == 2 - assert np.isnan(s.idxmax(skipna=False)) - - # Using old-style behavior that treats floating point nan, -inf, and - # +inf as missing - with pd.option_context('mode.use_inf_as_na', True): - assert s.idxmin() == 0 - assert np.isnan(s.idxmin(skipna=False)) - assert s.idxmax() == 0 - np.isnan(s.idxmax(skipna=False)) - class TestSeriesUnaryOps(object): # __neg__, __pos__, __inv__ diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index ce464184cd8d6..4f47c308c9a13 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -967,35 +967,6 @@ def test_setops_preserve_freq(self, tz): assert result.freq == rng.freq assert result.tz == rng.tz - def test_min_max(self): - rng = date_range('1/1/2000', '12/31/2000') - rng2 = rng.take(np.random.permutation(len(rng))) - - the_min = rng2.min() - the_max = rng2.max() - assert isinstance(the_min, Timestamp) - assert isinstance(the_max, Timestamp) - assert the_min == rng[0] - assert the_max == rng[-1] - - assert rng.min() == rng[0] - assert rng.max() == rng[-1] - - def test_min_max_series(self): - rng = date_range('1/1/2000', periods=10, freq='4h') - lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] - df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls}) - - result = df.TS.max() - exp = Timestamp(df.TS.iat[-1]) - assert 
isinstance(result, Timestamp) - assert result == exp - - result = df.TS.min() - exp = Timestamp(df.TS.iat[0]) - assert isinstance(result, Timestamp) - assert result == exp - def test_from_M8_structured(self): dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] arr = np.array(dates, diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 6eada0e89b506..91e1af5c8887c 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -15,7 +15,7 @@ import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta, IntervalIndex, Interval, - CategoricalIndex, Timestamp) + CategoricalIndex, Timestamp, DataFrame, Panel) from pandas.compat import StringIO, PYPY, long from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.accessor import PandasDelegate @@ -49,8 +49,8 @@ class CheckImmutable(object): def check_mutable_error(self, *args, **kwargs): # Pass whatever function you normally would to pytest.raises # (after the Exception kind). 
- pytest.raises( - TypeError, self.mutable_regex, *args, **kwargs) + with pytest.raises(TypeError): + self.mutable_regex(*args, **kwargs) def test_no_mutable_funcs(self): def setitem(): @@ -227,14 +227,15 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): # an object that is datetimelike will raise a TypeError, # otherwise an AttributeError + err = AttributeError if issubclass(type(o), DatetimeIndexOpsMixin): - pytest.raises(TypeError, lambda: getattr(o, op)) - else: - pytest.raises(AttributeError, - lambda: getattr(o, op)) + err = TypeError + + with pytest.raises(err): + getattr(o, op) - def test_binary_ops_docs(self): - from pandas import DataFrame, Panel + @pytest.mark.parametrize('klass', [Series, DataFrame, Panel]) + def test_binary_ops_docs(self, klass): op_map = {'add': '+', 'sub': '-', 'mul': '*', @@ -242,18 +243,16 @@ def test_binary_ops_docs(self): 'pow': '**', 'truediv': '/', 'floordiv': '//'} - for op_name in ['add', 'sub', 'mul', 'mod', 'pow', 'truediv', - 'floordiv']: - for klass in [Series, DataFrame, Panel]: - operand1 = klass.__name__.lower() - operand2 = 'other' - op = op_map[op_name] - expected_str = ' '.join([operand1, op, operand2]) - assert expected_str in getattr(klass, op_name).__doc__ + for op_name in op_map: + operand1 = klass.__name__.lower() + operand2 = 'other' + op = op_map[op_name] + expected_str = ' '.join([operand1, op, operand2]) + assert expected_str in getattr(klass, op_name).__doc__ - # reverse version of the binary ops - expected_str = ' '.join([operand2, op, operand1]) - assert expected_str in getattr(klass, 'r' + op_name).__doc__ + # reverse version of the binary ops + expected_str = ' '.join([operand2, op, operand1]) + assert expected_str in getattr(klass, 'r' + op_name).__doc__ class TestIndexOps(Ops): @@ -338,68 +337,6 @@ def test_ndarray_compat_properties(self): assert Index([1]).item() == 1 assert Series([1]).item() == 1 - def test_ops(self): - for op in ['max', 'min']: - for o in self.objs: - 
result = getattr(o, op)() - if not isinstance(o, PeriodIndex): - expected = getattr(o.values, op)() - else: - expected = pd.Period( - ordinal=getattr(o._ndarray_values, op)(), - freq=o.freq) - try: - assert result == expected - except TypeError: - # comparing tz-aware series with np.array results in - # TypeError - expected = expected.astype('M8[ns]').astype('int64') - assert result.value == expected - - def test_nanops(self): - # GH 7261 - for op in ['max', 'min']: - for klass in [Index, Series]: - - obj = klass([np.nan, 2.0]) - assert getattr(obj, op)() == 2.0 - - obj = klass([np.nan]) - assert pd.isna(getattr(obj, op)()) - - obj = klass([]) - assert pd.isna(getattr(obj, op)()) - - obj = klass([pd.NaT, datetime(2011, 11, 1)]) - # check DatetimeIndex monotonic path - assert getattr(obj, op)() == datetime(2011, 11, 1) - - obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) - # check DatetimeIndex non-monotonic path - assert getattr(obj, op)(), datetime(2011, 11, 1) - - # argmin/max - obj = Index(np.arange(5, dtype='int64')) - assert obj.argmin() == 0 - assert obj.argmax() == 4 - - obj = Index([np.nan, 1, np.nan, 2]) - assert obj.argmin() == 1 - assert obj.argmax() == 3 - - obj = Index([np.nan]) - assert obj.argmin() == -1 - assert obj.argmax() == -1 - - obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), - pd.NaT]) - assert obj.argmin() == 1 - assert obj.argmax() == 2 - - obj = Index([pd.NaT]) - assert obj.argmin() == -1 - assert obj.argmax() == -1 - def test_value_counts_unique_nunique(self): for orig in self.objs: o = orig.copy() @@ -546,106 +483,105 @@ def test_value_counts_unique_nunique_null(self): assert o.nunique() == 8 assert o.nunique(dropna=False) == 9 - def test_value_counts_inferred(self): - klasses = [Index, Series] - for klass in klasses: - s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] - s = klass(s_values) - expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) - tm.assert_series_equal(s.value_counts(), expected) 
- - if isinstance(s, Index): - exp = Index(np.unique(np.array(s_values, dtype=np.object_))) - tm.assert_index_equal(s.unique(), exp) - else: - exp = np.unique(np.array(s_values, dtype=np.object_)) - tm.assert_numpy_array_equal(s.unique(), exp) - - assert s.nunique() == 4 - # don't sort, have to sort after the fact as not sorting is - # platform-dep - hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values() - tm.assert_series_equal(hist, expected) - - # sort ascending - hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list('cdab')) - tm.assert_series_equal(hist, expected) - - # relative histogram. - hist = s.value_counts(normalize=True) - expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) - tm.assert_series_equal(hist, expected) - - def test_value_counts_bins(self): - klasses = [Index, Series] - for klass in klasses: - s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] - s = klass(s_values) - - # bins - pytest.raises(TypeError, lambda bins: s.value_counts(bins=bins), 1) - - s1 = Series([1, 1, 2, 3]) - res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.997, 3.0): 4}) - tm.assert_series_equal(res1, exp1) - res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}) - tm.assert_series_equal(res1n, exp1n) - - if isinstance(s1, Index): - tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) - else: - exp = np.array([1, 2, 3], dtype=np.int64) - tm.assert_numpy_array_equal(s1.unique(), exp) - - assert s1.nunique() == 3 - - # these return the same - res4 = s1.value_counts(bins=4, dropna=True) - intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4, exp4) - - res4 = s1.value_counts(bins=4, dropna=False) - intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], 
index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4, exp4) - - res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], - index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4n, exp4n) - - # handle NA's properly - s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, - 'd', 'd', 'a', 'a', 'b'] - s = klass(s_values) - expected = Series([4, 3, 2], index=['b', 'a', 'd']) - tm.assert_series_equal(s.value_counts(), expected) - - if isinstance(s, Index): - exp = Index(['a', 'b', np.nan, 'd']) - tm.assert_index_equal(s.unique(), exp) - else: - exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) - tm.assert_numpy_array_equal(s.unique(), exp) - assert s.nunique() == 3 - - s = klass({}) - expected = Series([], dtype=np.int64) - tm.assert_series_equal(s.value_counts(), expected, - check_index_type=False) - # returned dtype differs depending on original - if isinstance(s, Index): - tm.assert_index_equal(s.unique(), Index([]), exact=False) - else: - tm.assert_numpy_array_equal(s.unique(), np.array([]), - check_dtype=False) + @pytest.mark.parametrize('klass', [Index, Series]) + def test_value_counts_inferred(self, klass): + s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] + s = klass(s_values) + expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(np.unique(np.array(s_values, dtype=np.object_))) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.unique(np.array(s_values, dtype=np.object_)) + tm.assert_numpy_array_equal(s.unique(), exp) + + assert s.nunique() == 4 + # don't sort, have to sort after the fact as not sorting is + # platform-dep + hist = s.value_counts(sort=False).sort_values() + expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values() + tm.assert_series_equal(hist, expected) + + # sort ascending + hist = s.value_counts(ascending=True) + expected = Series([1, 2, 3, 4], 
index=list('cdab')) + tm.assert_series_equal(hist, expected) + + # relative histogram. + hist = s.value_counts(normalize=True) + expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) + tm.assert_series_equal(hist, expected) + + @pytest.mark.parametrize('klass', [Index, Series]) + def test_value_counts_bins(self, klass): + s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] + s = klass(s_values) + + # bins + with pytest.raises(TypeError): + s.value_counts(bins=1) + + s1 = Series([1, 1, 2, 3]) + res1 = s1.value_counts(bins=1) + exp1 = Series({Interval(0.997, 3.0): 4}) + tm.assert_series_equal(res1, exp1) + res1n = s1.value_counts(bins=1, normalize=True) + exp1n = Series({Interval(0.997, 3.0): 1.0}) + tm.assert_series_equal(res1n, exp1n) + + if isinstance(s1, Index): + tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) + else: + exp = np.array([1, 2, 3], dtype=np.int64) + tm.assert_numpy_array_equal(s1.unique(), exp) + + assert s1.nunique() == 3 + + # these return the same + res4 = s1.value_counts(bins=4, dropna=True) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4 = s1.value_counts(bins=4, dropna=False) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4n = s1.value_counts(bins=4, normalize=True) + exp4n = Series([0.5, 0.25, 0.25, 0], + index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4n, exp4n) + + # handle NA's properly + s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, + 'd', 'd', 'a', 'a', 'b'] + s = klass(s_values) + expected = Series([4, 3, 2], index=['b', 'a', 'd']) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(['a', 'b', np.nan, 'd']) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.array(['a', 'b', np.nan, 
'd'], dtype=object) + tm.assert_numpy_array_equal(s.unique(), exp) + assert s.nunique() == 3 - assert s.nunique() == 0 + s = klass({}) + expected = Series([], dtype=np.int64) + tm.assert_series_equal(s.value_counts(), expected, + check_index_type=False) + # returned dtype differs depending on original + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), Index([]), exact=False) + else: + tm.assert_numpy_array_equal(s.unique(), np.array([]), + check_dtype=False) + + assert s.nunique() == 0 @pytest.mark.parametrize('klass', [Index, Series]) def test_value_counts_datetime64(self, klass): @@ -1001,8 +937,10 @@ def test_getitem(self): assert i[-1] == i[9] - pytest.raises(IndexError, i.__getitem__, 20) - pytest.raises(IndexError, s.iloc.__getitem__, 20) + with pytest.raises(IndexError): + i[20] + with pytest.raises(IndexError): + s.iloc[20] @pytest.mark.parametrize('indexer_klass', [list, pd.Index]) @pytest.mark.parametrize('indexer', [[True] * 10, [False] * 10, @@ -1022,10 +960,7 @@ class TestTranspose(Ops): def test_transpose(self): for obj in self.objs: - if isinstance(obj, Index): - tm.assert_index_equal(obj.transpose(), obj) - else: - tm.assert_series_equal(obj.transpose(), obj) + tm.assert_equal(obj.transpose(), obj) def test_transpose_non_default_axes(self): for obj in self.objs: @@ -1036,10 +971,7 @@ def test_transpose_non_default_axes(self): def test_numpy_transpose(self): for obj in self.objs: - if isinstance(obj, Index): - tm.assert_index_equal(np.transpose(obj), obj) - else: - tm.assert_series_equal(np.transpose(obj), obj) + tm.assert_equal(np.transpose(obj), obj) with pytest.raises(ValueError, match=self.errmsg): np.transpose(obj, axes=1) From d7bf6f2cc870c6ba5e465307c5e8271cbaab8af7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 23 Dec 2018 07:47:06 -0800 Subject: [PATCH 11/15] REF: Pieces broken off of #24024 (#24364) --- pandas/core/arrays/datetimelike.py | 23 +++++++++++--- pandas/core/arrays/datetimes.py | 26 +++++++++++++--- 
pandas/core/arrays/period.py | 12 +++---- pandas/core/arrays/timedeltas.py | 6 ++-- pandas/core/indexes/base.py | 4 ++- pandas/core/indexes/datetimelike.py | 9 ++++-- pandas/core/indexes/datetimes.py | 14 ++++----- pandas/core/indexes/timedeltas.py | 31 +++++++++---------- pandas/core/reshape/merge.py | 4 +-- pandas/core/tools/datetimes.py | 5 +-- pandas/tests/arrays/test_timedeltas.py | 11 +++++++ .../indexes/datetimes/test_construction.py | 6 ++-- pandas/tests/indexes/datetimes/test_tools.py | 13 ++++++++ 13 files changed, 112 insertions(+), 52 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0463a30cd1135..f82004747f0d0 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -296,9 +296,22 @@ def __iter__(self): @property def asi8(self): + # type: () -> ndarray + """ + Integer representation of the values. + + Returns + ------- + ndarray + An ndarray with int64 dtype. + """ # do not cache or you'll create a memory leak return self._data.view('i8') + @property + def _ndarray_values(self): + return self._data + # ---------------------------------------------------------------- # Rendering Methods @@ -469,7 +482,7 @@ def _isnan(self): return (self.asi8 == iNaT) @property # NB: override with cache_readonly in immutable subclasses - def hasnans(self): + def _hasnans(self): """ return if I have any nans; enables various perf speedups """ @@ -493,7 +506,7 @@ def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): This is an internal routine """ - if self.hasnans: + if self._hasnans: if convert: result = result.astype(convert) if fill_value is None: @@ -696,7 +709,7 @@ def _add_delta_tdi(self, other): new_values = checked_add_with_arr(self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan) - if self.hasnans or other.hasnans: + if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = iNaT return new_values.view('i8') @@ -764,7 
+777,7 @@ def _sub_period_array(self, other): b_mask=other._isnan) new_values = np.array([self.freq.base * x for x in new_values]) - if self.hasnans or other.hasnans: + if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = NaT return new_values @@ -1085,7 +1098,7 @@ def _evaluate_compare(self, other, op): elif lib.is_scalar(lib.item_from_zerodim(other)): # ndarray scalar other = [other.item()] - other = type(self)(other) + other = type(self)._from_sequence(other) # compare result = op(self.asi8, other.asi8) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 59e9fe49f650a..2d1330dd87152 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -115,7 +115,7 @@ def wrapper(self, other): else: if isinstance(other, list): try: - other = type(self)(other) + other = type(self)._from_sequence(other) except ValueError: other = np.array(other, dtype=np.object_) elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries, @@ -147,7 +147,7 @@ def wrapper(self, other): if o_mask.any(): result[o_mask] = nat_result - if self.hasnans: + if self._hasnans: result[self._isnan] = nat_result return result @@ -349,6 +349,19 @@ def _box_func(self): @property def dtype(self): + # type: () -> Union[np.dtype, DatetimeTZDtype] + """ + The dtype for the DatetimeArray. + + Returns + ------- + numpy.dtype or DatetimeTZDtype + If the values are tz-naive, then ``np.dtype('datetime64[ns]')`` + is returned. + + If the values are tz-aware, then the ``DatetimeTZDtype`` + is returned. + """ if self.tz is None: return _NS_DTYPE return DatetimeTZDtype('ns', self.tz) @@ -356,7 +369,12 @@ def dtype(self): @property def tz(self): """ - Return timezone. + Return timezone, if any. + + Returns + ------- + datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None + Returns None when the array is tz-naive. 
""" # GH 18595 return self._tz @@ -534,7 +552,7 @@ def _sub_datetime_arraylike(self, other): other_i8 = other.asi8 new_values = checked_add_with_arr(self_i8, -other_i8, arr_mask=self._isnan) - if self.hasnans or other.hasnans: + if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = iNaT return new_values.view('timedelta64[ns]') diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 6fd98bb25380a..16951275707cc 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -84,7 +84,7 @@ def wrapper(self, other): other = Period(other, freq=self.freq) result = op(other.ordinal) - if self.hasnans: + if self._hasnans: result[self._isnan] = nat_result return result @@ -499,7 +499,7 @@ def _time_shift(self, n, freq=None): "{cls}._time_shift" .format(cls=type(self).__name__)) values = self.asi8 + n * self.freq.n - if self.hasnans: + if self._hasnans: values[self._isnan] = iNaT return type(self)(values, freq=self.freq) @@ -561,7 +561,7 @@ def asfreq(self, freq=None, how='E'): new_data = period_asfreq_arr(ordinal, base1, base2, end) - if self.hasnans: + if self._hasnans: new_data[self._isnan] = iNaT return type(self)(new_data, freq=freq) @@ -581,7 +581,7 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): else: formatter = lambda dt: u'%s' % dt - if self.hasnans: + if self._hasnans: mask = self._isnan values[mask] = na_rep imask = ~mask @@ -668,7 +668,7 @@ def _sub_period(self, other): new_data = asi8 - other.ordinal new_data = np.array([self.freq * x for x in new_data]) - if self.hasnans: + if self._hasnans: new_data[self._isnan] = NaT return new_data @@ -983,7 +983,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): """ if data.dtype != np.dtype('M8[ns]'): - raise ValueError('Wrong dtype: %s' % data.dtype) + raise ValueError('Wrong dtype: {dtype}'.format(dtype=data.dtype)) if freq is None: if isinstance(data, ABCIndexClass): diff --git 
a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 314a3948f1032..06a9627a290c6 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -54,7 +54,7 @@ def _field_accessor(name, alias, docstring=None): def f(self): values = self.asi8 result = get_timedelta_field(values, alias) - if self.hasnans: + if self._hasnans: result = self._maybe_mask_results(result, fill_value=None, convert='float64') @@ -102,7 +102,7 @@ def wrapper(self, other): if o_mask.any(): result[o_mask] = nat_result - if self.hasnans: + if self._hasnans: result[self._isnan] = nat_result return result @@ -714,7 +714,7 @@ def components(self): columns = ['days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds', 'nanoseconds'] - hasnans = self.hasnans + hasnans = self._hasnans if hasnans: def f(x): if isna(x): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a2cf88fa9cb1a..478902fe53e58 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -682,7 +682,7 @@ def __array__(self, dtype=None): """ The array interface, return my values. """ - return self._data.view(np.ndarray) + return np.asarray(self._data, dtype=dtype) def __array_wrap__(self, result, context=None): """ @@ -739,6 +739,8 @@ def view(self, cls=None): Parameters ---------- dtype : numpy dtype or pandas type + Note that any integer `dtype` is treated as ``'int64'``, + regardless of the sign and size. copy : bool, default True By default, astype always returns a newly allocated object. 
If copy is set to False and internal requirements on dtype are diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index db0cb88b06b2b..86fa7f785914f 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -40,17 +40,22 @@ class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin): # override DatetimeLikeArrayMixin method copy = Index.copy unique = Index.unique - take = Index.take # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. They can be made into cache_readonly for Index # subclasses bc they are immutable inferred_freq = cache_readonly(DatetimeLikeArrayMixin.inferred_freq.fget) _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) - hasnans = cache_readonly(DatetimeLikeArrayMixin.hasnans.fget) + hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) + _hasnans = hasnans # for index / array -agnostic code _resolution = cache_readonly(DatetimeLikeArrayMixin._resolution.fget) resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) + # A few methods that are shared + _maybe_mask_results = DatetimeLikeArrayMixin._maybe_mask_results + + # ------------------------------------------------------------------------ + def equals(self, other): """ Determines if two Index objects contain the same elements. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 0e4132524045c..09e741af363da 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -227,11 +227,11 @@ def __new__(cls, data=None, "endpoints is deprecated. 
Use " "`pandas.date_range` instead.", FutureWarning, stacklevel=2) - result = cls._generate_range(start, end, periods, - freq=freq, tz=tz, normalize=normalize, - closed=closed, ambiguous=ambiguous) - result.name = name - return result + dtarr = DatetimeArray._generate_range( + start, end, periods, + freq=freq, tz=tz, normalize=normalize, + closed=closed, ambiguous=ambiguous) + return cls(dtarr, name=name) if is_scalar(data): raise TypeError("{cls}() must be called with a " @@ -1473,12 +1473,12 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, if freq is None and com._any_none(periods, start, end): freq = 'D' - result = DatetimeIndex._generate_range( + dtarr = DatetimeArray._generate_range( start=start, end=end, periods=periods, freq=freq, tz=tz, normalize=normalize, closed=closed, **kwargs) - result.name = name + result = DatetimeIndex(dtarr, name=name) return result diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index e6c714683979f..47f7f7cf860fc 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -125,15 +125,6 @@ def _join_i8_wrapper(joinf, **kwargs): _left_indexer_unique = _join_i8_wrapper( libjoin.left_join_indexer_unique_int64, with_indexers=False) - # define my properties & methods for delegation - _other_ops = [] - _bool_ops = [] - _object_ops = ['freq'] - _field_ops = ['days', 'seconds', 'microseconds', 'nanoseconds'] - _datetimelike_ops = _field_ops + _object_ops + _bool_ops - _datetimelike_methods = ["to_pytimedelta", "total_seconds", - "round", "floor", "ceil"] - _engine_type = libindex.TimedeltaEngine _comparables = ['name', 'freq'] @@ -143,6 +134,14 @@ def _join_i8_wrapper(joinf, **kwargs): _freq = None + _box_func = TimedeltaArray._box_func + _bool_ops = TimedeltaArray._bool_ops + _object_ops = TimedeltaArray._object_ops + _field_ops = TimedeltaArray._field_ops + _datetimelike_ops = TimedeltaArray._datetimelike_ops + _datetimelike_methods = 
TimedeltaArray._datetimelike_methods + _other_ops = TimedeltaArray._other_ops + # ------------------------------------------------------------------- # Constructors @@ -163,10 +162,9 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, "endpoints is deprecated. Use " "`pandas.timedelta_range` instead.", FutureWarning, stacklevel=2) - result = cls._generate_range(start, end, periods, freq, - closed=closed) - result.name = name - return result + tdarr = TimedeltaArray._generate_range(start, end, periods, freq, + closed=closed) + return cls(tdarr, name=name) if is_scalar(data): raise TypeError('{cls}() must be called with a ' @@ -766,7 +764,6 @@ def timedelta_range(start=None, end=None, periods=None, freq=None, freq = 'D' freq, freq_infer = dtl.maybe_infer_freq(freq) - result = TimedeltaIndex._generate_range(start, end, periods, freq, - closed=closed) - result.name = name - return result + tdarr = TimedeltaArray._generate_range(start, end, periods, freq, + closed=closed) + return TimedeltaIndex(tdarr, name=name) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0adeb7997a888..58344c0ec9ec7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1592,8 +1592,8 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys(lk, rk, sort=True): # Some pre-processing for non-ndarray lk / rk if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): - lk = lk.values - rk = rk.values + lk = lk._data + rk = rk._data elif (is_categorical_dtype(lk) and is_categorical_dtype(rk) and diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4fca5216e24f3..45d2615a3d055 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -171,6 +171,7 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, - ndarray of Timestamps if box=False """ from pandas import DatetimeIndex + from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray 
from pandas.core.arrays.datetimes import ( maybe_convert_dtype, objects_to_datetime64ns) @@ -179,14 +180,14 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, # these are shortcutable if is_datetime64tz_dtype(arg): - if not isinstance(arg, DatetimeIndex): + if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) if tz == 'utc': arg = arg.tz_convert(None).tz_localize(tz) return arg elif is_datetime64_ns_dtype(arg): - if box and not isinstance(arg, DatetimeIndex): + if box and not isinstance(arg, (DatetimeArray, DatetimeIndex)): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index aef30c1bb7744..97ac3fce07088 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -8,6 +8,17 @@ import pandas.util.testing as tm +class TestTimedeltaArrayConstructor(object): + def test_copy(self): + data = np.array([1, 2, 3], dtype='m8[ns]') + arr = TimedeltaArray(data, copy=False) + assert arr._data is data + + arr = TimedeltaArray(data, copy=True) + assert arr._data is not data + assert arr._data.base is not data + + class TestTimedeltaArray(object): def test_from_sequence_dtype(self): msg = r"Only timedelta64\[ns\] dtype is valid" diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 88c322ff7c9ff..bca99d27bda56 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -7,8 +7,7 @@ import pytest import pytz -from pandas._libs.tslib import OutOfBoundsDatetime -from pandas._libs.tslibs import conversion +from pandas._libs.tslibs import OutOfBoundsDatetime, conversion import pandas as pd from pandas import ( @@ -21,7 +20,8 @@ class TestDatetimeIndex(object): - @pytest.mark.parametrize('dt_cls', [DatetimeIndex, DatetimeArray]) + 
@pytest.mark.parametrize('dt_cls', [DatetimeIndex, + DatetimeArray._from_sequence]) def test_freq_validation_with_nat(self, dt_cls): # GH#11587 make sure we get a useful error message when generate_range # raises diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index c24c1025ea63c..13f9648d46216 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -24,6 +24,7 @@ from pandas import ( DataFrame, DatetimeIndex, Index, NaT, Series, Timestamp, compat, date_range, isna, to_datetime) +from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray from pandas.core.tools import datetimes as tools from pandas.util import testing as tm from pandas.util.testing import assert_series_equal @@ -246,6 +247,18 @@ def test_to_datetime_parse_timezone_keeps_name(self): class TestToDatetime(object): + @pytest.mark.parametrize('tz', [None, 'US/Central']) + def test_to_datetime_dtarr(self, tz): + # DatetimeArray + dti = date_range('1965-04-03', periods=19, freq='2W', tz=tz) + arr = DatetimeArray(dti) + + result = to_datetime(arr) + assert result is arr + + result = to_datetime(arr, box=True) + assert result is arr + def test_to_datetime_pydatetime(self): actual = pd.to_datetime(datetime(2008, 1, 15)) assert actual == datetime(2008, 1, 15) From 0c593ae9b6c6b843026809ced54251014688c123 Mon Sep 17 00:00:00 2001 From: Justin Zheng Date: Sun, 23 Dec 2018 09:31:01 -0800 Subject: [PATCH 12/15] ENH GH11978 access pd.plotting._misc from plot accessor (#23811) --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/plotting/_core.py | 23 +++++++++++++++++++++++ pandas/tests/plotting/test_frame.py | 16 ++++++++++++++++ pandas/tests/plotting/test_series.py | 13 +++++++++++++ 4 files changed, 53 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 724cfddb1b94c..a2abda019812a 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ 
b/doc/source/whatsnew/v0.24.0.rst @@ -371,6 +371,7 @@ Other Enhancements - :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object. - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the an ``axis`` parameter (:issue:`8839`) +- The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`) - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) .. _whatsnew_0240.api_breaking: diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 52d40f84cb1d9..c55952085f8c5 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -26,6 +26,7 @@ from pandas.core.generic import _shared_doc_kwargs, _shared_docs from pandas.io.formats.printing import pprint_thing +from pandas.plotting import _misc as misc from pandas.plotting._compat import _mpl_ge_3_0_0 from pandas.plotting._style import _get_standard_colors, plot_params from pandas.plotting._tools import ( @@ -2903,6 +2904,15 @@ def pie(self, **kwds): """ return self(kind='pie', **kwds) + def lag(self, *args, **kwds): + return misc.lag_plot(self._parent, *args, **kwds) + + def autocorrelation(self, *args, **kwds): + return misc.autocorrelation_plot(self._parent, *args, **kwds) + + def bootstrap(self, *args, **kwds): + return misc.bootstrap_plot(self._parent, *args, **kwds) + class FramePlotMethods(BasePlotMethods): """DataFrame plotting accessor and method @@ -3598,3 +3608,16 @@ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, if gridsize is not 
None: kwds['gridsize'] = gridsize return self(kind='hexbin', x=x, y=y, C=C, **kwds) + + def scatter_matrix(self, *args, **kwds): + return misc.scatter_matrix(self._parent, *args, **kwds) + + def andrews_curves(self, class_column, *args, **kwds): + return misc.andrews_curves(self._parent, class_column, *args, **kwds) + + def parallel_coordinates(self, class_column, *args, **kwds): + return misc.parallel_coordinates(self._parent, class_column, + *args, **kwds) + + def radviz(self, class_column, *args, **kwds): + return misc.radviz(self._parent, class_column, *args, **kwds) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4e047cd44c1e2..350d1bb153274 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2988,6 +2988,22 @@ def test_secondary_axis_font_size(self, method): self._check_ticks_props(axes=ax.right_ax, ylabelsize=fontsize) + def test_misc_bindings(self, mock): + df = pd.DataFrame(randn(10, 10), columns=list('abcdefghij')) + p1 = mock.patch('pandas.plotting._misc.scatter_matrix', + return_value=2) + p2 = mock.patch('pandas.plotting._misc.andrews_curves', + return_value=2) + p3 = mock.patch('pandas.plotting._misc.parallel_coordinates', + return_value=2) + p4 = mock.patch('pandas.plotting._misc.radviz', + return_value=2) + with p1, p2, p3, p4: + assert df.plot.scatter_matrix() == 2 + assert df.plot.andrews_curves('a') == 2 + assert df.plot.parallel_coordinates('a') == 2 + assert df.plot.radviz('a') == 2 + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index cc8aa2018b1a0..b5c69bb9e6443 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -877,3 +877,16 @@ def test_custom_business_day_freq(self): freq=CustomBusinessDay(holidays=['2014-05-26']))) _check_plot_works(s.plot) + + def test_misc_bindings(self, mock): + s = 
Series(randn(10)) + p1 = mock.patch('pandas.plotting._misc.lag_plot', + return_value=2) + p2 = mock.patch('pandas.plotting._misc.autocorrelation_plot', + return_value=2) + p3 = mock.patch('pandas.plotting._misc.bootstrap_plot', + return_value=2) + with p1, p2, p3: + assert s.plot.lag() == 2 + assert s.plot.autocorrelation() == 2 + assert s.plot.bootstrap() == 2 From 1cd077a3077352d63c1226d554db1e46cb9d2c1a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 23 Dec 2018 09:32:49 -0800 Subject: [PATCH 13/15] BUG: Fix overflow bugs in date_Range (#24255) --- pandas/core/arrays/_ranges.py | 188 ++++++++++++++++++ pandas/core/arrays/datetimes.py | 107 +--------- .../indexes/datetimes/test_date_range.py | 61 ++++++ 3 files changed, 253 insertions(+), 103 deletions(-) create mode 100644 pandas/core/arrays/_ranges.py diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py new file mode 100644 index 0000000000000..66c1b8e158672 --- /dev/null +++ b/pandas/core/arrays/_ranges.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +""" +Helper functions to generate range-like data for DatetimeArray +(and possibly TimedeltaArray/PeriodArray) +""" + +import numpy as np + +from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp + +from pandas.tseries.offsets import Tick, generate_range + + +def generate_regular_range(start, end, periods, freq): + """ + Generate a range of dates with the spans between dates described by + the given `freq` DateOffset. 
+ + Parameters + ---------- + start : Timestamp or None + first point of produced date range + end : Timestamp or None + last point of produced date range + periods : int + number of periods in produced date range + freq : DateOffset + describes space between dates in produced date range + + Returns + ------- + ndarray[np.int64] representing nanosecond unix timestamps + """ + if isinstance(freq, Tick): + stride = freq.nanos + if periods is None: + b = Timestamp(start).value + # cannot just use e = Timestamp(end) + 1 because arange breaks when + # stride is too large, see GH10887 + e = (b + (Timestamp(end).value - b) // stride * stride + + stride // 2 + 1) + # end.tz == start.tz by this point due to _generate implementation + tz = start.tz + elif start is not None: + b = Timestamp(start).value + e = _generate_range_overflow_safe(b, periods, stride, side='start') + tz = start.tz + elif end is not None: + e = Timestamp(end).value + stride + b = _generate_range_overflow_safe(e, periods, stride, side='end') + tz = end.tz + else: + raise ValueError("at least 'start' or 'end' should be specified " + "if a 'period' is given.") + + with np.errstate(over="raise"): + # If the range is sufficiently large, np.arange may overflow + # and incorrectly return an empty array if not caught. 
+ try: + values = np.arange(b, e, stride, dtype=np.int64) + except FloatingPointError: + xdr = [b] + while xdr[-1] != e: + xdr.append(xdr[-1] + stride) + values = np.array(xdr[:-1], dtype=np.int64) + + else: + tz = None + # start and end should have the same timezone by this point + if start is not None: + tz = start.tz + elif end is not None: + tz = end.tz + + xdr = generate_range(start=start, end=end, + periods=periods, offset=freq) + + values = np.array([x.value for x in xdr], dtype=np.int64) + + return values, tz + + +def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): + """ + Calculate the second endpoint for passing to np.arange, checking + to avoid an integer overflow. Catch OverflowError and re-raise + as OutOfBoundsDatetime. + + Parameters + ---------- + endpoint : int + nanosecond timestamp of the known endpoint of the desired range + periods : int + number of periods in the desired range + stride : int + nanoseconds between periods in the desired range + side : {'start', 'end'} + which end of the range `endpoint` refers to + + Returns + ------- + other_end : int + + Raises + ------ + OutOfBoundsDatetime + """ + # GH#14187 raise instead of incorrectly wrapping around + assert side in ['start', 'end'] + + i64max = np.uint64(np.iinfo(np.int64).max) + msg = ('Cannot generate range with {side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, periods=periods)) + + with np.errstate(over="raise"): + # if periods * strides cannot be multiplied within the *uint64* bounds, + # we cannot salvage the operation by recursing, so raise + try: + addend = np.uint64(periods) * np.uint64(np.abs(stride)) + except FloatingPointError: + raise OutOfBoundsDatetime(msg) + + if np.abs(addend) <= i64max: + # relatively easy case without casting concerns + return _generate_range_overflow_safe_signed( + endpoint, periods, stride, side) + + elif ((endpoint > 0 and side == 'start' and stride > 0) or + (endpoint < 0 and side == 'end' 
and stride > 0)): + # no chance of not-overflowing + raise OutOfBoundsDatetime(msg) + + elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): + # in _generate_regular_range we added `stride` thereby overflowing + # the bounds. Adjust to fix this. + return _generate_range_overflow_safe(endpoint - stride, + periods - 1, stride, side) + + # split into smaller pieces + mid_periods = periods // 2 + remaining = periods - mid_periods + assert 0 < remaining < periods, (remaining, periods, endpoint, stride) + + midpoint = _generate_range_overflow_safe(endpoint, mid_periods, + stride, side) + return _generate_range_overflow_safe(midpoint, remaining, stride, side) + + +def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): + """ + A special case for _generate_range_overflow_safe where `periods * stride` + can be calculated without overflowing int64 bounds. + """ + assert side in ['start', 'end'] + if side == 'end': + stride *= -1 + + with np.errstate(over="raise"): + addend = np.int64(periods) * np.int64(stride) + try: + # easy case with no overflows + return np.int64(endpoint) + addend + except (FloatingPointError, OverflowError): + # with endpoint negative and addend positive we risk + # FloatingPointError; with reversed signed we risk OverflowError + pass + + # if stride and endpoint had opposite signs, then endpoint + addend + # should never overflow. 
so they must have the same signs + assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0) + + if stride > 0: + # watch out for very special case in which we just slightly + # exceed implementation bounds, but when passing the result to + # np.arange will get a result slightly within the bounds + assert endpoint >= 0 + result = np.uint64(endpoint) + np.uint64(addend) + i64max = np.uint64(np.iinfo(np.int64).max) + assert result > i64max + if result <= i64max + np.uint64(stride): + return result + + raise OutOfBoundsDatetime('Cannot generate range with ' + '{side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, + periods=periods)) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 2d1330dd87152..a933f41faab67 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -24,10 +24,11 @@ from pandas.core import ops from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com from pandas.tseries.frequencies import get_period_alias, to_offset -from pandas.tseries.offsets import Day, Tick, generate_range +from pandas.tseries.offsets import Day, Tick _midnight = time(0, 0) @@ -306,7 +307,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if end is not None: end = end.tz_localize(None) # TODO: consider re-implementing _cached_range; GH#17914 - index = _generate_regular_range(cls, start, end, periods, freq) + values, _tz = generate_regular_range(start, end, periods, freq) + index = cls._simple_new(values, freq=freq, tz=_tz) if tz is not None and index.tz is None: arr = conversion.tz_localize_to_utc( @@ -1715,107 +1717,6 @@ def maybe_convert_dtype(data, copy): return data, copy -def _generate_regular_range(cls, start, end, periods, freq): - """ - Generate a range of dates with the spans between dates described by - 
the given `freq` DateOffset. - - Parameters - ---------- - cls : class - start : Timestamp or None - first point of produced date range - end : Timestamp or None - last point of produced date range - periods : int - number of periods in produced date range - freq : DateOffset - describes space between dates in produced date range - - Returns - ------- - ndarray[np.int64] representing nanosecond unix timestamps - - """ - if isinstance(freq, Tick): - stride = freq.nanos - if periods is None: - b = Timestamp(start).value - # cannot just use e = Timestamp(end) + 1 because arange breaks when - # stride is too large, see GH10887 - e = (b + (Timestamp(end).value - b) // stride * stride + - stride // 2 + 1) - # end.tz == start.tz by this point due to _generate implementation - tz = start.tz - elif start is not None: - b = Timestamp(start).value - e = _generate_range_overflow_safe(b, periods, stride, side='start') - tz = start.tz - elif end is not None: - e = Timestamp(end).value + stride - b = _generate_range_overflow_safe(e, periods, stride, side='end') - tz = end.tz - else: - raise ValueError("at least 'start' or 'end' should be specified " - "if a 'period' is given.") - - values = np.arange(b, e, stride, dtype=np.int64) - - else: - tz = None - # start and end should have the same timezone by this point - if start is not None: - tz = start.tz - elif end is not None: - tz = end.tz - - xdr = generate_range(start=start, end=end, - periods=periods, offset=freq) - - values = np.array([x.value for x in xdr], dtype=np.int64) - - data = cls._simple_new(values, freq=freq, tz=tz) - return data - - -def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): - """ - Calculate the second endpoint for passing to np.arange, checking - to avoid an integer overflow. Catch OverflowError and re-raise - as OutOfBoundsDatetime. 
- - Parameters - ---------- - endpoint : int - periods : int - stride : int - side : {'start', 'end'} - - Returns - ------- - other_end : int - - Raises - ------ - OutOfBoundsDatetime - """ - # GH#14187 raise instead of incorrectly wrapping around - assert side in ['start', 'end'] - if side == 'end': - stride *= -1 - - try: - other_end = checked_add_with_arr(np.int64(endpoint), - np.int64(periods) * stride) - except OverflowError: - raise tslib.OutOfBoundsDatetime('Cannot generate range with ' - '{side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, - periods=periods)) - return other_end - - # ------------------------------------------------------------------- # Validation and Inference diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index d150a91fe7f18..a9bece248e9d0 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -88,6 +88,67 @@ def test_date_range_nat(self): with pytest.raises(ValueError, match=msg): date_range(start=pd.NaT, end='2016-01-01', freq='D') + def test_date_range_multiplication_overflow(self): + # GH#24255 + # check that overflows in calculating `addend = periods * stride` + # are caught + with tm.assert_produces_warning(None): + # we should _not_ be seeing a overflow RuntimeWarning + dti = date_range(start='1677-09-22', periods=213503, freq='D') + + assert dti[0] == Timestamp('1677-09-22') + assert len(dti) == 213503 + + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range('1969-05-04', periods=200000000, freq='30000D') + + def test_date_range_unsigned_overflow_handling(self): + # GH#24255 + # case where `addend = periods * stride` overflows int64 bounds + # but not uint64 bounds + dti = date_range(start='1677-09-22', end='2262-04-11', freq='D') + + dti2 = date_range(start=dti[0], periods=len(dti), freq='D') + assert 
dti2.equals(dti) + + dti3 = date_range(end=dti[-1], periods=len(dti), freq='D') + assert dti3.equals(dti) + + def test_date_range_int64_overflow_non_recoverable(self): + # GH#24255 + # case with start later than 1970-01-01, overflow int64 but not uint64 + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(start='1970-02-01', periods=106752 * 24, freq='H') + + # case with end before 1970-01-01, overflow int64 but not uint64 + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(end='1969-11-14', periods=106752 * 24, freq='H') + + def test_date_range_int64_overflow_stride_endpoint_different_signs(self): + # cases where stride * periods overflow int64 and stride/endpoint + # have different signs + start = Timestamp('2262-02-23') + end = Timestamp('1969-11-14') + + expected = date_range(start=start, end=end, freq='-1H') + assert expected[0] == start + assert expected[-1] == end + + dti = date_range(end=end, periods=len(expected), freq='-1H') + tm.assert_index_equal(dti, expected) + + start2 = Timestamp('1970-02-01') + end2 = Timestamp('1677-10-22') + + expected2 = date_range(start=start2, end=end2, freq='-1H') + assert expected2[0] == start2 + assert expected2[-1] == end2 + + dti2 = date_range(start=start2, periods=len(expected2), freq='-1H') + tm.assert_index_equal(dti2, expected2) + def test_date_range_out_of_bounds(self): # GH#14187 with pytest.raises(OutOfBoundsDatetime): From 0d2cd533bf93260d866dc8ac3479ba3797ac6a14 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 23 Dec 2018 23:06:00 +0000 Subject: [PATCH 14/15] REF/TST: Add more pytest idiom to resample/test_base.py (#24377) --- pandas/tests/resample/conftest.py | 14 +- pandas/tests/resample/test_base.py | 399 +++++++++++++---------------- 2 files changed, 193 insertions(+), 220 deletions(-) diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index d5b32891ea1df..b84f88da85cc0 100644 --- 
a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -86,10 +86,22 @@ def series(index, _series_name, _static_values): @pytest.fixture -def frame(index, _static_values): +def empty_series(series): + return series[:0] + + +@pytest.fixture +def frame(index, _series_name, _static_values): + # _series_name is intentionally unused return DataFrame({'value': _static_values}, index=index) +@pytest.fixture +def empty_frame(series): + index = series.index[:0] + return DataFrame(index=index) + + @pytest.fixture(params=[Series, DataFrame]) def series_and_frame(request, series, frame): if request.param == Series: diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 74003151abfb2..31199dc01b659 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -1,5 +1,3 @@ -# pylint: disable=E1101 - from datetime import datetime, timedelta import numpy as np @@ -19,6 +17,27 @@ assert_almost_equal, assert_frame_equal, assert_index_equal, assert_series_equal) +# a fixture value can be overridden by the test parameter value. Note that the +# value of the fixture can be overridden this way even if the test doesn't use +# it directly (doesn't mention it in the function prototype). 
+# see https://docs.pytest.org/en/latest/fixture.html#override-a-fixture-with-direct-test-parametrization # noqa +# in this module we override the fixture values defined in conftest.py +# tuples of '_index_factory,_series_name,_index_start,_index_end' +DATE_RANGE = (date_range, 'dti', datetime(2005, 1, 1), datetime(2005, 1, 10)) +PERIOD_RANGE = ( + period_range, 'pi', datetime(2005, 1, 1), datetime(2005, 1, 10)) +TIMEDELTA_RANGE = (timedelta_range, 'tdi', '1 day', '10 day') + +ALL_TIMESERIES_INDEXES = [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE] + + +def pytest_generate_tests(metafunc): + # called once per each test function + if metafunc.function.__name__.endswith('_all_ts'): + metafunc.parametrize( + '_index_factory,_series_name,_index_start,_index_end', + ALL_TIMESERIES_INDEXES) + @pytest.fixture def create_index(_index_factory): @@ -28,234 +47,176 @@ def _create_index(*args, **kwargs): return _create_index -class Base(object): - """ - base class for resampling testing, calling - .create_series() generates a series of each index type - """ - - @pytest.mark.parametrize('freq', ['2D', '1H']) - def test_asfreq(self, series_and_frame, freq, create_index): - obj = series_and_frame - - result = obj.resample(freq).asfreq() - new_index = create_index(obj.index[0], obj.index[-1], freq=freq) - expected = obj.reindex(new_index) - assert_almost_equal(result, expected) - - def test_asfreq_fill_value(self, create_index): - # test for fill value during resampling, issue 3715 - - s = self.create_series() - - result = s.resample('1H').asfreq() - new_index = create_index(s.index[0], s.index[-1], freq='1H') - expected = s.reindex(new_index) - assert_series_equal(result, expected) - - frame = s.to_frame('value') - frame.iloc[1] = None - result = frame.resample('1H').asfreq(fill_value=4.0) - new_index = create_index(frame.index[0], - frame.index[-1], freq='1H') - expected = frame.reindex(new_index, fill_value=4.0) - assert_frame_equal(result, expected) - - def 
test_resample_interpolate(self): - # # 12925 - df = self.create_series().to_frame('value') - assert_frame_equal( - df.resample('1T').asfreq().interpolate(), - df.resample('1T').interpolate()) - - def test_raises_on_non_datetimelike_index(self): - # this is a non datetimelike index - xp = DataFrame() - pytest.raises(TypeError, lambda: xp.resample('A').mean()) - - @pytest.mark.parametrize('freq', ['M', 'D', 'H']) - def test_resample_empty_series(self, freq, resample_method): - # GH12771 & GH12868 - - if resample_method == 'ohlc': - pytest.skip('need to test for ohlc from GH13083') - - s = self.create_series()[:0] - result = getattr(s.resample(freq), resample_method)() - - expected = s.copy() - expected.index = s.index._shallow_copy(freq=freq) - assert_index_equal(result.index, expected.index) - assert result.index.freq == expected.index.freq - assert_series_equal(result, expected, check_dtype=False) - - @pytest.mark.parametrize('freq', ['M', 'D', 'H']) - def test_resample_empty_dataframe(self, freq, resample_method): - # GH13212 - index = self.create_series().index[:0] - f = DataFrame(index=index) - - # count retains dimensions too - result = getattr(f.resample(freq), resample_method)() - if resample_method != 'size': - expected = f.copy() - else: - # GH14962 - expected = Series([]) - - expected.index = f.index._shallow_copy(freq=freq) - assert_index_equal(result.index, expected.index) - assert result.index.freq == expected.index.freq - assert_almost_equal(result, expected, check_dtype=False) - - # test size for GH13212 (currently stays as df) - - @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) - @pytest.mark.parametrize( - "dtype", - [np.float, np.int, np.object, 'datetime64[ns]']) - def test_resample_empty_dtypes(self, index, dtype, resample_method): - - # Empty series were sometimes causing a segfault (for the functions - # with Cython bounds-checking disabled) or an IndexError. We just run - # them to ensure they no longer do. 
(GH #10228) - empty_series = Series([], index, dtype) - try: - getattr(empty_series.resample('d'), resample_method)() - except DataError: - # Ignore these since some combinations are invalid - # (ex: doing mean with dtype of np.object) - pass - - def test_resample_loffset_arg_type(self, create_index): - # GH 13218, 15002 - df = self.create_series().to_frame('value') - expected_means = [df.values[i:i + 2].mean() - for i in range(0, len(df.values), 2)] - expected_index = create_index(df.index[0], - periods=len(df.index) / 2, - freq='2D') - - # loffset coerces PeriodIndex to DateTimeIndex - if isinstance(expected_index, PeriodIndex): - expected_index = expected_index.to_timestamp() - - expected_index += timedelta(hours=2) - expected = DataFrame({'value': expected_means}, index=expected_index) - - for arg in ['mean', {'value': 'mean'}, ['mean']]: - - result_agg = df.resample('2D', loffset='2H').agg(arg) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result_how = df.resample('2D', how=arg, loffset='2H') - - if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([('value', - 'mean')]) - - # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex - if isinstance(expected.index, TimedeltaIndex): - with pytest.raises(AssertionError): - assert_frame_equal(result_agg, expected) - assert_frame_equal(result_how, expected) - else: - assert_frame_equal(result_agg, expected) - assert_frame_equal(result_how, expected) - - def test_apply_to_empty_series(self): - # GH 14313 - series = self.create_series()[:0] - - for freq in ['M', 'D', 'H']: - result = series.resample(freq).apply(lambda x: 1) - expected = series.resample(freq).apply(np.sum) - - assert_series_equal(result, expected, check_dtype=False) - - def test_resampler_is_iterable(self): - # GH 15314 - series = self.create_series() - freq = 'H' - tg = TimeGrouper(freq, convention='start') - grouped = series.groupby(tg) - resampled = series.resample(freq) - for (rk, rv), (gk, gv) in 
zip(resampled, grouped): - assert rk == gk - assert_series_equal(rv, gv) - - def test_resample_quantile(self): - # GH 15023 - s = self.create_series() - q = 0.75 - freq = 'H' - result = s.resample(freq).quantile(q) - expected = s.resample(freq).agg(lambda x: x.quantile(q)) - tm.assert_series_equal(result, expected) - - -class TestDatetimeIndex(Base): - @pytest.fixture - def _index_factory(self): - return date_range - - @pytest.fixture - def _series_name(self): - return 'dti' - - def create_series(self): - i = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') - - return Series(np.arange(len(i)), index=i, name='dti') +@pytest.mark.parametrize('freq', ['2D', '1H']) +@pytest.mark.parametrize( + '_index_factory,_series_name,_index_start,_index_end', + [DATE_RANGE, TIMEDELTA_RANGE] +) +def test_asfreq(series_and_frame, freq, create_index): + obj = series_and_frame + + result = obj.resample(freq).asfreq() + new_index = create_index(obj.index[0], obj.index[-1], freq=freq) + expected = obj.reindex(new_index) + assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + '_index_factory,_series_name,_index_start,_index_end', + [DATE_RANGE, TIMEDELTA_RANGE] +) +def test_asfreq_fill_value(series, create_index): + # test for fill value during resampling, issue 3715 + + s = series + + result = s.resample('1H').asfreq() + new_index = create_index(s.index[0], s.index[-1], freq='1H') + expected = s.reindex(new_index) + assert_series_equal(result, expected) + + frame = s.to_frame('value') + frame.iloc[1] = None + result = frame.resample('1H').asfreq(fill_value=4.0) + new_index = create_index(frame.index[0], + frame.index[-1], freq='1H') + expected = frame.reindex(new_index, fill_value=4.0) + assert_frame_equal(result, expected) + + +def test_resample_interpolate_all_ts(frame): + # # 12925 + df = frame + assert_frame_equal( + df.resample('1T').asfreq().interpolate(), + df.resample('1T').interpolate()) + + +def test_raises_on_non_datetimelike_index(): + # 
this is a non datetimelike index + xp = DataFrame() + pytest.raises(TypeError, lambda: xp.resample('A').mean()) + + +@pytest.mark.parametrize('freq', ['M', 'D', 'H']) +def test_resample_empty_series_all_ts(freq, empty_series, resample_method): + # GH12771 & GH12868 + + if resample_method == 'ohlc': + pytest.skip('need to test for ohlc from GH13083') + + s = empty_series + result = getattr(s.resample(freq), resample_method)() + + expected = s.copy() + expected.index = s.index._shallow_copy(freq=freq) + assert_index_equal(result.index, expected.index) + assert result.index.freq == expected.index.freq + assert_series_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize('freq', ['M', 'D', 'H']) +def test_resample_empty_dataframe_all_ts(empty_frame, freq, resample_method): + # GH13212 + df = empty_frame + # count retains dimensions too + result = getattr(df.resample(freq), resample_method)() + if resample_method != 'size': + expected = df.copy() + else: + # GH14962 + expected = Series([]) + + expected.index = df.index._shallow_copy(freq=freq) + assert_index_equal(result.index, expected.index) + assert result.index.freq == expected.index.freq + assert_almost_equal(result, expected, check_dtype=False) + + # test size for GH13212 (currently stays as df) + + +@pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) +@pytest.mark.parametrize( + "dtype", + [np.float, np.int, np.object, 'datetime64[ns]']) +def test_resample_empty_dtypes(index, dtype, resample_method): + + # Empty series were sometimes causing a segfault (for the functions + # with Cython bounds-checking disabled) or an IndexError. We just run + # them to ensure they no longer do. 
(GH #10228) + empty_series = Series([], index, dtype) + try: + getattr(empty_series.resample('d'), resample_method)() + except DataError: + # Ignore these since some combinations are invalid + # (ex: doing mean with dtype of np.object) + pass -class TestPeriodIndex(Base): - @pytest.fixture - def _index_factory(self): - return period_range +def test_resample_loffset_arg_type_all_ts(frame, create_index): + # GH 13218, 15002 + df = frame + expected_means = [df.values[i:i + 2].mean() + for i in range(0, len(df.values), 2)] + expected_index = create_index(df.index[0], + periods=len(df.index) / 2, + freq='2D') - @pytest.fixture - def _series_name(self): - return 'pi' + # loffset coerces PeriodIndex to DateTimeIndex + if isinstance(expected_index, PeriodIndex): + expected_index = expected_index.to_timestamp() - def create_series(self): - # TODO: replace calls to .create_series() by injecting the series - # fixture - i = period_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') + expected_index += timedelta(hours=2) + expected = DataFrame({'value': expected_means}, index=expected_index) - return Series(np.arange(len(i)), index=i, name='pi') + for arg in ['mean', {'value': 'mean'}, ['mean']]: - @pytest.mark.skip() - def test_asfreq(self): - pass + result_agg = df.resample('2D', loffset='2H').agg(arg) - @pytest.mark.skip() - def test_asfreq_fill_value(self): - pass + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result_how = df.resample('2D', how=arg, loffset='2H') + if isinstance(arg, list): + expected.columns = pd.MultiIndex.from_tuples([('value', + 'mean')]) -class TestTimedeltaIndex(Base): - @pytest.fixture - def _index_factory(self): - return timedelta_range + # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex + if isinstance(expected.index, TimedeltaIndex): + with pytest.raises(AssertionError): + assert_frame_equal(result_agg, expected) + assert_frame_equal(result_how, expected) + else: + assert_frame_equal(result_agg, 
expected) + assert_frame_equal(result_how, expected) - @pytest.fixture - def _index_start(self): - return '1 day' - @pytest.fixture - def _index_end(self): - return '10 day' +def test_apply_to_empty_series_all_ts(empty_series): + # GH 14313 + s = empty_series + for freq in ['M', 'D', 'H']: + result = s.resample(freq).apply(lambda x: 1) + expected = s.resample(freq).apply(np.sum) - @pytest.fixture - def _series_name(self): - return 'tdi' + assert_series_equal(result, expected, check_dtype=False) - def create_series(self): - i = timedelta_range('1 day', - '10 day', freq='D') - return Series(np.arange(len(i)), index=i, name='tdi') +def test_resampler_is_iterable_all_ts(series): + # GH 15314 + freq = 'H' + tg = TimeGrouper(freq, convention='start') + grouped = series.groupby(tg) + resampled = series.resample(freq) + for (rk, rv), (gk, gv) in zip(resampled, grouped): + assert rk == gk + assert_series_equal(rv, gv) + + +def test_resample_quantile_all_ts(series): + # GH 15023 + s = series + q = 0.75 + freq = 'H' + result = s.resample(freq).quantile(q) + expected = s.resample(freq).agg(lambda x: x.quantile(q)) + tm.assert_series_equal(result, expected) From fc7bc3f74d1f5702bf66c519ad190126538a8f5b Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sun, 23 Dec 2018 16:08:19 -0700 Subject: [PATCH 15/15] CLN: Make repeat method consistent (#24395) --- pandas/core/arrays/base.py | 52 ++++++++++++++----- pandas/core/arrays/categorical.py | 12 ++--- pandas/core/arrays/interval.py | 39 +++----------- pandas/core/arrays/period.py | 14 ----- pandas/core/indexes/base.py | 35 ++++++++----- pandas/core/indexes/datetimelike.py | 12 ++--- pandas/core/indexes/multi.py | 1 + pandas/core/indexes/period.py | 4 -- pandas/core/series.py | 52 +++++++++++++++++-- .../arrays/categorical/test_analytics.py | 16 ------ pandas/tests/arrays/interval/test_interval.py | 16 ------ pandas/tests/extension/base/methods.py | 42 +++++++-------- pandas/tests/indexes/period/test_ops.py | 20 ------- 
pandas/tests/indexes/period/test_period.py | 29 +++++------ 14 files changed, 157 insertions(+), 187 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a848dafbb06ef..5311d6b8d9d90 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -12,6 +12,7 @@ from pandas.compat import PY3, set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -20,6 +21,8 @@ _not_implemented_message = "{} does not implement {}." +_extension_array_shared_docs = dict() + class ExtensionArray(object): """ @@ -580,32 +583,55 @@ def factorize(self, na_sentinel=-1): uniques = self._from_factorized(uniques, self) return labels, uniques - def repeat(self, repeats, axis=None): - """ - Repeat elements of an array. + _extension_array_shared_docs['repeat'] = """ + Repeat elements of a %(klass)s. - .. versionadded:: 0.24.0 + Returns a new %(klass)s where each element of the current %(klass)s + is repeated consecutively a given number of times. Parameters ---------- - repeats : int - This should be a non-negative integer. Repeating 0 times - will return an empty array. + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + %(klass)s. + *args + Additional arguments have no effect but might be accepted for + compatibility with numpy. + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. Returns ------- - repeated_array : ExtensionArray - Same type as the input, with elements repeated `repeats` times. + repeated_array : %(klass)s + Newly created %(klass)s with repeated elements. See Also -------- + Series.repeat : Equivalent function for Series. 
+ Index.repeat : Equivalent function for Index. numpy.repeat : Similar method for :class:`numpy.ndarray`. ExtensionArray.take : Take arbitrary positions. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'b', 'c']) + >>> cat + [a, b, c] + Categories (3, object): [a, b, c] + >>> cat.repeat(2) + [a, a, b, b, c, c] + Categories (3, object): [a, b, c] + >>> cat.repeat([1, 2, 3]) + [a, b, b, c, c, c] + Categories (3, object): [a, b, c] """ - if axis is not None: - raise ValueError("'axis' must be None.") - if repeats < 0: - raise ValueError("negative repeats are not allowed.") + + @Substitution(klass='ExtensionArray') + @Appender(_extension_array_shared_docs['repeat']) + def repeat(self, repeats, *args, **kwargs): + nv.validate_repeat(args, kwargs) ind = np.arange(len(self)).repeat(repeats) return self.take(ind) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9a8b345cea1b3..62362e643b9ae 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -39,7 +39,7 @@ from pandas.io.formats import console from pandas.io.formats.terminal import get_terminal_size -from .base import ExtensionArray +from .base import ExtensionArray, _extension_array_shared_docs _take_msg = textwrap.dedent("""\ Interpreting negative values in 'indexer' as missing values. @@ -2394,15 +2394,9 @@ def describe(self): return result + @Substitution(klass='Categorical') + @Appender(_extension_array_shared_docs['repeat']) def repeat(self, repeats, *args, **kwargs): - """ - Repeat elements of a Categorical. 
- - See Also - -------- - numpy.ndarray.repeat - - """ nv.validate_repeat(args, kwargs) codes = self._codes.repeat(repeats) return self._constructor(values=codes, dtype=self.dtype, fastpath=True) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1a1648a3b8480..d67645c8b4451 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -20,12 +20,13 @@ ABCDatetimeIndex, ABCInterval, ABCIntervalIndex, ABCPeriodIndex, ABCSeries) from pandas.core.dtypes.missing import isna, notna +from pandas.core.arrays.base import ( + ExtensionArray, _extension_array_shared_docs) +from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.config import get_option from pandas.core.indexes.base import Index, ensure_index -from . import Categorical, ExtensionArray - _VALID_CLOSED = {'left', 'right', 'both', 'neither'} _interval_shared_docs = {} @@ -1000,35 +1001,11 @@ def to_tuples(self, na_tuple=True): tuples = np.where(~self.isna(), tuples, np.nan) return tuples - def repeat(self, repeats, **kwargs): - """ - Repeat elements of an IntervalArray. - - Returns a new IntervalArray where each element of the current - IntervalArray is repeated consecutively a given number of times. - - Parameters - ---------- - repeats : int - The number of repetitions for each element. - - **kwargs - Additional keywords have no effect but might be accepted for - compatibility with numpy. - - Returns - ------- - IntervalArray - Newly created IntervalArray with repeated elements. - - See Also - -------- - Index.repeat : Equivalent function for Index. - Series.repeat : Equivalent function for Series. - numpy.repeat : Underlying implementation. 
- """ - left_repeat = self.left.repeat(repeats, **kwargs) - right_repeat = self.right.repeat(repeats, **kwargs) + @Appender(_extension_array_shared_docs['repeat'] % _shared_docs_kwargs) + def repeat(self, repeats, *args, **kwargs): + nv.validate_repeat(args, kwargs) + left_repeat = self.left.repeat(repeats) + right_repeat = self.right.repeat(repeats) return self._shallow_copy(left=left_repeat, right=right_repeat) _interval_shared_docs['overlaps'] = """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 16951275707cc..5f4d98a81e5f2 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -11,7 +11,6 @@ period_asfreq_arr) from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds import pandas.compat as compat -from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, cache_readonly from pandas.util._validators import validate_fillna_kwargs @@ -593,19 +592,6 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): # ------------------------------------------------------------------ - def repeat(self, repeats, *args, **kwargs): - """ - Repeat elements of a PeriodArray. - - See Also - -------- - numpy.ndarray.repeat - """ - # TODO(DatetimeArray): remove - nv.validate_repeat(args, kwargs) - values = self._data.repeat(repeats) - return type(self)(values, self.freq) - def astype(self, dtype, copy=True): # TODO: Figure out something better here... # We have DatetimeLikeArrayMixin -> diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 478902fe53e58..e0ed0ca28c6ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -834,41 +834,48 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, taken = values.take(indices) return taken - def repeat(self, repeats, *args, **kwargs): - """ - Repeat elements of an Index. + _index_shared_docs['repeat'] = """ + Repeat elements of a %(klass)s. 
- Returns a new index where each element of the current index + Returns a new %(klass)s where each element of the current %(klass)s is repeated consecutively a given number of times. Parameters ---------- - repeats : int - The number of repetitions for each element. + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + %(klass)s. + *args + Additional arguments have no effect but might be accepted for + compatibility with numpy. **kwargs Additional keywords have no effect but might be accepted for compatibility with numpy. Returns ------- - pandas.Index - Newly created Index with repeated elements. + repeated_index : %(klass)s + Newly created %(klass)s with repeated elements. See Also -------- Series.repeat : Equivalent function for Series. - numpy.repeat : Underlying implementation. + numpy.repeat : Similar method for :class:`numpy.ndarray`. Examples -------- - >>> idx = pd.Index([1, 2, 3]) + >>> idx = pd.Index(['a', 'b', 'c']) >>> idx - Int64Index([1, 2, 3], dtype='int64') + Index(['a', 'b', 'c'], dtype='object') >>> idx.repeat(2) - Int64Index([1, 1, 2, 2, 3, 3], dtype='int64') - >>> idx.repeat(3) - Int64Index([1, 1, 1, 2, 2, 2, 3, 3, 3], dtype='int64') + Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object') + >>> idx.repeat([1, 2, 3]) + Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object') """ + + @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) + def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(repeats)) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 86fa7f785914f..8a319a65314dd 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -450,17 +450,11 @@ def isin(self, values): return algorithms.isin(self.asi8, values.asi8) + @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) def 
repeat(self, repeats, *args, **kwargs): - """ - Analogous to ndarray.repeat. - """ nv.validate_repeat(args, kwargs) - if is_period_dtype(self): - freq = self.freq - else: - freq = None - return self._shallow_copy(self.asi8.repeat(repeats), - freq=freq) + freq = self.freq if is_period_dtype(self) else None + return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) @Appender(_index_shared_docs['where'] % _index_doc_kwargs) def where(self, cond, other=None): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 86ef3695ee292..ef4a85e964cad 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1857,6 +1857,7 @@ def append(self, other): def argsort(self, *args, **kwargs): return self.values.argsort(*args, **kwargs) + @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return MultiIndex(levels=self.levels, diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 17666cd651a50..b15604a57fb81 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -924,10 +924,6 @@ def wrapper(self, other): wrapper.__name__ = '__{}__'.format(op.__name__) return wrapper - def repeat(self, repeats, *args, **kwargs): - # TODO(DatetimeArray): Just use Index.repeat - return Index.repeat(self, repeats, *args, **kwargs) - def view(self, dtype=None, type=None): # TODO(DatetimeArray): remove if dtype is None or dtype is __builtins__['type'](self): diff --git a/pandas/core/series.py b/pandas/core/series.py index d642a221e4494..dacd587e7e73f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1038,12 +1038,58 @@ def _set_values(self, key, value): def repeat(self, repeats, *args, **kwargs): """ - Repeat elements of an Series. Refer to `numpy.ndarray.repeat` - for more information about the `repeats` argument. + Repeat elements of a Series. 
+ + Returns a new Series where each element of the current Series + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + Series. + *args + Additional arguments have no effect but might be accepted for + compatibility with numpy. + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + + Returns + ------- + repeated_series : Series + Newly created Series with repeated elements. See Also -------- - numpy.ndarray.repeat + Index.repeat : Equivalent function for Index. + numpy.repeat : Similar method for :class:`numpy.ndarray`. + + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + >>> s.repeat(2) + 0 a + 0 a + 1 b + 1 b + 2 c + 2 c + dtype: object + >>> s.repeat([1, 2, 3]) + 0 a + 1 b + 1 b + 2 c + 2 c + 2 c + dtype: object """ nv.validate_repeat(args, kwargs) new_index = self.index.repeat(repeats) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index b2c9151e1fa94..5efcd527de8d8 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -295,22 +295,6 @@ def test_validate_inplace(self): with pytest.raises(ValueError): cat.sort_values(inplace=value) - def test_repeat(self): - # GH10183 - cat = Categorical(["a", "b"], categories=["a", "b"]) - exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"]) - res = cat.repeat(2) - tm.assert_categorical_equal(res, exp) - - def test_numpy_repeat(self): - cat = Categorical(["a", "b"], categories=["a", "b"]) - exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"]) - tm.assert_categorical_equal(np.repeat(cat, 2), exp) - - msg = "the 'axis' parameter is not supported" - with pytest.raises(ValueError, match=msg): - 
np.repeat(cat, 2, axis=1) - def test_isna(self): exp = np.array([False, False, True]) c = Categorical(["a", "b", np.nan]) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 9604010571294..e81e64d90ff5f 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -26,22 +26,6 @@ def left_right_dtypes(request): class TestMethods(object): - @pytest.mark.parametrize('repeats', [0, 1, 5]) - def test_repeat(self, left_right_dtypes, repeats): - left, right = left_right_dtypes - result = IntervalArray.from_arrays(left, right).repeat(repeats) - expected = IntervalArray.from_arrays( - left.repeat(repeats), right.repeat(repeats)) - tm.assert_extension_array_equal(result, expected) - - @pytest.mark.parametrize('bad_repeats, msg', [ - (-1, 'negative dimensions are not allowed'), - ('foo', r'invalid literal for (int|long)\(\) with base 10')]) - def test_repeat_errors(self, bad_repeats, msg): - array = IntervalArray.from_breaks(range(4)) - with pytest.raises(ValueError, match=msg): - array.repeat(bad_repeats) - @pytest.mark.parametrize('new_closed', [ 'left', 'right', 'both', 'neither']) def test_set_closed(self, closed, new_closed): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 3403d0e9e02f1..bd59a9d3c4b16 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -265,33 +265,33 @@ def test_where_series(self, data, na_value, as_frame): expected = expected.to_frame(name='a') self.assert_equal(result, expected) + @pytest.mark.parametrize("use_numpy", [True, False]) @pytest.mark.parametrize("as_series", [True, False]) - @pytest.mark.parametrize("repeats", [0, 1, 2]) - def test_repeat(self, data, repeats, as_series): - a, b, c = data[:3] - arr = type(data)._from_sequence([a, b, c], dtype=data.dtype) - + @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]]) + def 
test_repeat(self, data, repeats, as_series, use_numpy): + arr = type(data)._from_sequence(data[:3], dtype=data.dtype) if as_series: arr = pd.Series(arr) - result = arr.repeat(repeats) + result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats) - if repeats == 0: - expected = [] - elif repeats == 1: - expected = [a, b, c] - else: - expected = [a, a, b, b, c, c] + repeats = [repeats] * 3 if isinstance(repeats, int) else repeats + expected = [x for x, n in zip(arr, repeats) for _ in range(n)] expected = type(data)._from_sequence(expected, dtype=data.dtype) if as_series: - index = pd.Series(np.arange(len(arr))).repeat(repeats).index - expected = pd.Series(expected, index=index) - self.assert_equal(result, expected) + expected = pd.Series(expected, index=arr.index.repeat(repeats)) - def test_repeat_raises(self, data): - with pytest.raises(ValueError, match="'axis'"): - data.repeat(2, axis=1) + self.assert_equal(result, expected) - with pytest.raises(ValueError, - match="negative"): - data.repeat(-1) + @pytest.mark.parametrize("use_numpy", [True, False]) + @pytest.mark.parametrize('repeats, kwargs, error, msg', [ + (2, dict(axis=1), ValueError, "'axis"), + (-1, dict(), ValueError, "negative"), + ([1, 2], dict(), ValueError, "shape"), + (2, dict(foo='bar'), TypeError, "'foo'")]) + def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy): + with pytest.raises(error, match=msg): + if use_numpy: + np.repeat(data, repeats, **kwargs) + else: + data.repeat(repeats, **kwargs) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 6648be5d2818a..eebff39fdf46f 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -317,26 +317,6 @@ def test_shift(self): # This is tested in test_arithmetic pass - def test_repeat(self): - index = pd.period_range('2001-01-01', periods=2, freq='D') - exp = pd.PeriodIndex(['2001-01-01', '2001-01-01', - '2001-01-02', '2001-01-02'], 
freq='D') - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - - index = pd.period_range('2001-01-01', periods=2, freq='2D') - exp = pd.PeriodIndex(['2001-01-01', '2001-01-01', - '2001-01-03', '2001-01-03'], freq='2D') - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - - index = pd.PeriodIndex(['2001-01', 'NaT', '2003-01'], freq='M') - exp = pd.PeriodIndex(['2001-01', '2001-01', '2001-01', - 'NaT', 'NaT', 'NaT', - '2003-01', '2003-01', '2003-01'], freq='M') - for res in [index.repeat(3), np.repeat(index, 3)]: - tm.assert_index_equal(res, exp) - def test_nat(self): assert pd.PeriodIndex._na_value is NaT assert pd.PeriodIndex([], freq='M')._na_value is NaT diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index a5169aba2db33..37bfb9c0606a3 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -37,13 +37,19 @@ def test_where(self): # This is handled in test_indexing pass - def test_repeat(self): + @pytest.mark.parametrize('use_numpy', [True, False]) + @pytest.mark.parametrize('index', [ + pd.period_range('2000-01-01', periods=3, freq='D'), + pytest.param( + pd.period_range('2001-01-01', periods=3, freq='2D'), + marks=pytest.mark.xfail(reason='GH 24391')), + pd.PeriodIndex(['2001-01', 'NaT', '2003-01'], freq='M')]) + def test_repeat_freqstr(self, index, use_numpy): # GH10183 - idx = pd.period_range('2000-01-01', periods=3, freq='D') - res = idx.repeat(3) - exp = PeriodIndex(idx.values.repeat(3), freq='D') - tm.assert_index_equal(res, exp) - assert res.freqstr == 'D' + expected = PeriodIndex([p for p in index for _ in range(3)]) + result = np.repeat(index, 3) if use_numpy else index.repeat(3) + tm.assert_index_equal(result, expected) + assert result.freqstr == index.freqstr def test_fillna_period(self): # GH 11343 @@ -444,17 +450,6 @@ def test_pindex_qaccess(self): # Todo: fix these accessors! 
assert s['05Q4'] == s[2] - def test_numpy_repeat(self): - index = period_range('20010101', periods=2) - expected = PeriodIndex([Period('2001-01-01'), Period('2001-01-01'), - Period('2001-01-02'), Period('2001-01-02')]) - - tm.assert_index_equal(np.repeat(index, 2), expected) - - msg = "the 'axis' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.repeat(index, 2, axis=1) - def test_pindex_multiples(self): pi = PeriodIndex(start='1/1/11', end='12/31/11', freq='2M') expected = PeriodIndex(['2011-01', '2011-03', '2011-05', '2011-07',