diff --git a/pandas/tests/reductions/__init__.py b/pandas/tests/reductions/__init__.py new file mode 100644 index 0000000000000..e3851753b6742 --- /dev/null +++ b/pandas/tests/reductions/__init__.py @@ -0,0 +1,4 @@ +""" +Tests for reductions where we want to test for matching behavior across +Array, Index, Series, and DataFrame methods. +""" diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py new file mode 100644 index 0000000000000..e7f984919d80b --- /dev/null +++ b/pandas/tests/reductions/test_reductions.py @@ -0,0 +1,817 @@ +# -*- coding: utf-8 -*- +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, DataFrame, Index, PeriodIndex, Series, compat +from pandas.core import nanops +import pandas.util.testing as tm + + +def get_objs(): + indexes = [ + tm.makeBoolIndex(10, name='a'), + tm.makeIntIndex(10, name='a'), + tm.makeFloatIndex(10, name='a'), + tm.makeDateIndex(10, name='a'), + tm.makeDateIndex(10, name='a').tz_localize(tz='US/Eastern'), + tm.makePeriodIndex(10, name='a'), + tm.makeStringIndex(10, name='a'), + tm.makeUnicodeIndex(10, name='a') + ] + + arr = np.random.randn(10) + series = [Series(arr, index=idx, name='a') for idx in indexes] + + objs = indexes + series + return objs + + +objs = get_objs() + + +class TestReductions(object): + + @pytest.mark.parametrize('opname', ['max', 'min']) + @pytest.mark.parametrize('obj', objs) + def test_ops(self, opname, obj): + result = getattr(obj, opname)() + if not isinstance(obj, PeriodIndex): + expected = getattr(obj.values, opname)() + else: + expected = pd.Period( + ordinal=getattr(obj._ndarray_values, opname)(), + freq=obj.freq) + try: + assert result == expected + except TypeError: + # comparing tz-aware series with np.array results in + # TypeError + expected = expected.astype('M8[ns]').astype('int64') + assert result.value == expected + + def test_nanops(self): + # GH#7261 + for opname in ['max', 'min']: + for klass in [Index, Series]: + + obj = klass([np.nan, 2.0]) + assert getattr(obj, opname)() == 2.0 + + obj = klass([np.nan]) + assert pd.isna(getattr(obj, opname)()) + + obj = klass([]) + assert pd.isna(getattr(obj, opname)()) + + obj = klass([pd.NaT, datetime(2011, 11, 1)]) + # check DatetimeIndex monotonic path + assert getattr(obj, opname)() == datetime(2011, 11, 1) + + obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) + # check DatetimeIndex non-monotonic path + assert getattr(obj, opname)(), datetime(2011, 11, 1) + + # argmin/max + obj = Index(np.arange(5, dtype='int64')) + assert obj.argmin() == 0 + assert obj.argmax() == 4 + + obj = Index([np.nan, 1, np.nan, 2]) + assert obj.argmin() == 1 + assert obj.argmax() == 3 + + obj = Index([np.nan]) + assert obj.argmin() == -1 + assert obj.argmax() == -1 + + obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), + pd.NaT]) + assert obj.argmin() == 1 + assert obj.argmax() == 2 + + obj = Index([pd.NaT]) + assert obj.argmin() == -1 + assert obj.argmax() == -1 + + +class TestSeriesReductions(object): + # Note: the name TestSeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def test_sum_inf(self): + s = Series(np.random.randn(10)) + s2 = s.copy() + + s[5:8] = np.inf + s2[5:8] = np.nan + + assert np.isinf(s.sum()) + + arr = np.random.randn(100, 100).astype('f4') + arr[:, 2] = np.inf + + with pd.option_context("mode.use_inf_as_na", True): + tm.assert_almost_equal(s.sum(), s2.sum()) + + res = nanops.nansum(arr, axis=1) + assert np.isinf(res).all() + + @pytest.mark.parametrize("use_bottleneck", [True, False]) + @pytest.mark.parametrize("method, unit", [ + ("sum", 0.0), + ("prod", 1.0) + ]) + def test_empty(self, method, unit, use_bottleneck): + with pd.option_context("use_bottleneck", use_bottleneck): + # GH#9422 / GH#18921 + # Entirely empty + s = Series([]) + # NA by default + result = getattr(s, method)() + assert result == unit + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) + assert pd.isna(result) + + # Skipna, default + result = getattr(s, method)(skipna=True) + result == unit + + # Skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) + assert pd.isna(result) + + # All-NA + s = Series([np.nan]) + # NA by default + result = getattr(s, method)() + assert result == unit + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) + assert pd.isna(result) + + # Skipna, default + result = getattr(s, method)(skipna=True) + result == unit + + # skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) + assert pd.isna(result) + + # Mix of valid, empty + s = Series([np.nan, 1]) + # Default + result = getattr(s, method)() + assert result == 1.0 + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == 1.0 + + result = getattr(s, method)(min_count=1) + assert result == 1.0 + + # Skipna + result = getattr(s, method)(skipna=True) + assert result == 1.0 + + result = getattr(s, method)(skipna=True, min_count=0) + assert result == 1.0 + + result = getattr(s, method)(skipna=True, min_count=1) + assert result == 1.0 + + # GH#844 (changed in GH#9422) + df = DataFrame(np.empty((10, 0))) + assert (getattr(df, method)(1) == unit).all() + + s = pd.Series([1]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan, 1]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + @pytest.mark.parametrize('method, unit', [ + ('sum', 0.0), + ('prod', 1.0), + ]) + def test_empty_multi(self, method, unit): + s = pd.Series([1, np.nan, np.nan, np.nan], + index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) + # 1 / 0 by default + result = getattr(s, method)(level=0) + expected = pd.Series([1, unit], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = getattr(s, method)(level=0, min_count=0) + expected = pd.Series([1, unit], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = getattr(s, method)(level=0, min_count=1) + expected = pd.Series([1, np.nan], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "method", ['mean', 'median', 'std', 'var']) + def test_ops_consistency_on_empty(self, method): + + # GH#7869 + # consistency on empty + + # float + result = getattr(Series(dtype=float), method)() + assert pd.isna(result) + + # timedelta64[ns] + result = getattr(Series(dtype='m8[ns]'), method)() + assert result is pd.NaT + + def test_nansum_buglet(self): + ser = Series([1.0, np.nan], index=[0, 1]) + result = np.nansum(ser) + tm.assert_almost_equal(result, 1) + + @pytest.mark.parametrize("use_bottleneck", [True, False]) + def test_sum_overflow(self, use_bottleneck): + + with pd.option_context('use_bottleneck', use_bottleneck): + # GH#6915 + # overflowing on the smaller int dtypes + for dtype in ['int32', 'int64']: + v = np.arange(5000000, dtype=dtype) + s = Series(v) + + result = s.sum(skipna=False) + assert int(result) == v.sum(dtype='int64') + result = s.min(skipna=False) + assert int(result) == 0 + result = s.max(skipna=False) + assert int(result) == v[-1] + + for dtype in ['float32', 'float64']: + v = np.arange(5000000, dtype=dtype) + s = Series(v) + + result = s.sum(skipna=False) + assert result == v.sum(dtype=dtype) + result = s.min(skipna=False) + assert np.allclose(float(result), 0.0) + result = s.max(skipna=False) + assert np.allclose(float(result), v[-1]) + + def test_empty_timeseries_reductions_return_nat(self): + # covers GH#11245 + for dtype in ('m8[ns]', 'm8[ns]', 'M8[ns]', 'M8[ns, UTC]'): + assert Series([], dtype=dtype).min() is pd.NaT + assert Series([], dtype=dtype).max() is pd.NaT + + def test_numpy_argmin_deprecated(self): + # See GH#16830 + data = np.arange(1, 11) + + s = Series(data, index=data) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # The deprecation of Series.argmin also causes a deprecation + # warning when calling np.argmin. This behavior is temporary + # until the implementation of Series.argmin is corrected. + result = np.argmin(s) + + assert result == 1 + + with tm.assert_produces_warning(FutureWarning): + # argmin is aliased to idxmin + result = s.argmin() + + assert result == 1 + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argmin(s, out=data) + + def test_numpy_argmax_deprecated(self): + # See GH#16830 + data = np.arange(1, 11) + + s = Series(data, index=data) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # The deprecation of Series.argmax also causes a deprecation + # warning when calling np.argmax. This behavior is temporary + # until the implementation of Series.argmax is corrected. + result = np.argmax(s) + assert result == 10 + + with tm.assert_produces_warning(FutureWarning): + # argmax is aliased to idxmax + result = s.argmax() + + assert result == 10 + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argmax(s, out=data) + + def test_idxmin(self): + # test idxmin + # _check_stat_op approach can not be used here because of isna check. + string_series = tm.makeStringSeries().rename('series') + + # add some NaNs + string_series[5:15] = np.NaN + + # skipna or no + assert string_series[string_series.idxmin()] == string_series.min() + assert pd.isna(string_series.idxmin(skipna=False)) + + # no NaNs + nona = string_series.dropna() + assert nona[nona.idxmin()] == nona.min() + assert (nona.index.values.tolist().index(nona.idxmin()) == + nona.values.argmin()) + + # all NaNs + allna = string_series * np.nan + assert pd.isna(allna.idxmin()) + + # datetime64[ns] + s = Series(pd.date_range('20130102', periods=6)) + result = s.idxmin() + assert result == 0 + + s[0] = np.nan + result = s.idxmin() + assert result == 1 + + def test_idxmax(self): + # test idxmax + # _check_stat_op approach can not be used here because of isna check. + string_series = tm.makeStringSeries().rename('series') + + # add some NaNs + string_series[5:15] = np.NaN + + # skipna or no + assert string_series[string_series.idxmax()] == string_series.max() + assert pd.isna(string_series.idxmax(skipna=False)) + + # no NaNs + nona = string_series.dropna() + assert nona[nona.idxmax()] == nona.max() + assert (nona.index.values.tolist().index(nona.idxmax()) == + nona.values.argmax()) + + # all NaNs + allna = string_series * np.nan + assert pd.isna(allna.idxmax()) + + from pandas import date_range + s = Series(date_range('20130102', periods=6)) + result = s.idxmax() + assert result == 5 + + s[5] = np.nan + result = s.idxmax() + assert result == 4 + + # Float64Index + # GH#5914 + s = pd.Series([1, 2, 3], [1.1, 2.1, 3.1]) + result = s.idxmax() + assert result == 3.1 + result = s.idxmin() + assert result == 1.1 + + s = pd.Series(s.index, s.index) + result = s.idxmax() + assert result == 3.1 + result = s.idxmin() + assert result == 1.1 + + def test_all_any(self): + ts = tm.makeTimeSeries() + bool_series = ts > 0 + assert not bool_series.all() + assert bool_series.any() + + # Alternative types, with implicit 'object' dtype. + s = Series(['abc', True]) + assert 'abc' == s.any() # 'abc' || True => 'abc' + + def test_all_any_params(self): + # Check skipna, with implicit 'object' dtype. + s1 = Series([np.nan, True]) + s2 = Series([np.nan, False]) + assert s1.all(skipna=False) # nan && True => True + assert s1.all(skipna=True) + assert np.isnan(s2.any(skipna=False)) # nan || False => nan + assert not s2.any(skipna=True) + + # Check level. + s = pd.Series([False, False, True, True, False, True], + index=[0, 0, 1, 1, 2, 2]) + tm.assert_series_equal(s.all(level=0), Series([False, True, False])) + tm.assert_series_equal(s.any(level=0), Series([False, True, True])) + + # bool_only is not implemented with level option. + with pytest.raises(NotImplementedError): + s.any(bool_only=True, level=0) + with pytest.raises(NotImplementedError): + s.all(bool_only=True, level=0) + + # bool_only is not implemented alone. + with pytest.raises(NotImplementedError): + s.any(bool_only=True,) + with pytest.raises(NotImplementedError): + s.all(bool_only=True) + + def test_timedelta64_analytics(self): + + # index min/max + dti = pd.date_range('2012-1-1', periods=3, freq='D') + td = Series(dti) - pd.Timestamp('20120101') + + result = td.idxmin() + assert result == 0 + + result = td.idxmax() + assert result == 2 + + # GH#2982 + # with NaT + td[0] = np.nan + + result = td.idxmin() + assert result == 1 + + result = td.idxmax() + assert result == 2 + + # abs + s1 = Series(pd.date_range('20120101', periods=3)) + s2 = Series(pd.date_range('20120102', periods=3)) + expected = Series(s2 - s1) + + # FIXME: don't leave commented-out code + # this fails as numpy returns timedelta64[us] + # result = np.abs(s1-s2) + # assert_frame_equal(result,expected) + + result = (s1 - s2).abs() + tm.assert_series_equal(result, expected) + + # max/min + result = td.max() + expected = pd.Timedelta('2 days') + assert result == expected + + result = td.min() + expected = pd.Timedelta('1 days') + assert result == expected + + @pytest.mark.parametrize( + "test_input,error_type", + [ + (pd.Series([]), ValueError), + + # For strings, or any Series with dtype 'O' + (pd.Series(['foo', 'bar', 'baz']), TypeError), + (pd.Series([(1,), (2,)]), TypeError), + + # For mixed data types + ( + pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz']), + TypeError + ), + ] + ) + def test_assert_idxminmax_raises(self, test_input, error_type): + """ + Cases where ``Series.argmax`` and related should raise an exception + """ + with pytest.raises(error_type): + test_input.idxmin() + with pytest.raises(error_type): + test_input.idxmin(skipna=False) + with pytest.raises(error_type): + test_input.idxmax() + with pytest.raises(error_type): + test_input.idxmax(skipna=False) + + def test_idxminmax_with_inf(self): + # For numeric data with NA and Inf (GH #13595) + s = pd.Series([0, -np.inf, np.inf, np.nan]) + + assert s.idxmin() == 1 + assert np.isnan(s.idxmin(skipna=False)) + + assert s.idxmax() == 2 + assert np.isnan(s.idxmax(skipna=False)) + + # Using old-style behavior that treats floating point nan, -inf, and + # +inf as missing + with pd.option_context('mode.use_inf_as_na', True): + assert s.idxmin() == 0 + assert np.isnan(s.idxmin(skipna=False)) + assert s.idxmax() == 0 + np.isnan(s.idxmax(skipna=False)) + + +class TestDatetime64SeriesReductions(object): + # Note: the name TestDatetime64SeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + @pytest.mark.parametrize('nat_ser', [ + Series([pd.NaT, pd.NaT]), + Series([pd.NaT, pd.Timedelta('nat')]), + Series([pd.Timedelta('nat'), pd.Timedelta('nat')])]) + def test_minmax_nat_series(self, nat_ser): + # GH#23282 + assert nat_ser.min() is pd.NaT + assert nat_ser.max() is pd.NaT + + @pytest.mark.parametrize('nat_df', [ + pd.DataFrame([pd.NaT, pd.NaT]), + pd.DataFrame([pd.NaT, pd.Timedelta('nat')]), + pd.DataFrame([pd.Timedelta('nat'), pd.Timedelta('nat')])]) + def test_minmax_nat_dataframe(self, nat_df): + # GH#23282 + assert nat_df.min()[0] is pd.NaT + assert nat_df.max()[0] is pd.NaT + + def test_min_max(self): + rng = pd.date_range('1/1/2000', '12/31/2000') + rng2 = rng.take(np.random.permutation(len(rng))) + + the_min = rng2.min() + the_max = rng2.max() + assert isinstance(the_min, pd.Timestamp) + assert isinstance(the_max, pd.Timestamp) + assert the_min == rng[0] + assert the_max == rng[-1] + + assert rng.min() == rng[0] + assert rng.max() == rng[-1] + + def test_min_max_series(self): + rng = pd.date_range('1/1/2000', periods=10, freq='4h') + lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] + df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls}) + + result = df.TS.max() + exp = pd.Timestamp(df.TS.iat[-1]) + assert isinstance(result, pd.Timestamp) + assert result == exp + + result = df.TS.min() + exp = pd.Timestamp(df.TS.iat[0]) + assert isinstance(result, pd.Timestamp) + assert result == exp + + +class TestCategoricalSeriesReductions(object): + # Note: the name TestCategoricalSeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def test_min_max(self): + # unordered cats have no min/max + cat = Series(Categorical(["a", "b", "c", "d"], ordered=False)) + with pytest.raises(TypeError): + cat.min() + with pytest.raises(TypeError): + cat.max() + + cat = Series(Categorical(["a", "b", "c", "d"], ordered=True)) + _min = cat.min() + _max = cat.max() + assert _min == "a" + assert _max == "d" + + cat = Series(Categorical(["a", "b", "c", "d"], categories=[ + 'd', 'c', 'b', 'a'], ordered=True)) + _min = cat.min() + _max = cat.max() + assert _min == "d" + assert _max == "a" + + cat = Series(Categorical( + [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a' + ], ordered=True)) + _min = cat.min() + _max = cat.max() + assert np.isnan(_min) + assert _max == "b" + + cat = Series(Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True)) + _min = cat.min() + _max = cat.max() + assert np.isnan(_min) + assert _max == 1 + + +class TestSeriesMode(object): + # Note: the name TestSeriesMode indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + @pytest.mark.parametrize('dropna, expected', [ + (True, Series([], dtype=np.float64)), + (False, Series([], dtype=np.float64)) + ]) + def test_mode_empty(self, dropna, expected): + s = Series([], dtype=np.float64) + result = s.mode(dropna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, data, expected', [ + (True, [1, 1, 1, 2], [1]), + (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + (False, [1, 1, 1, 2], [1]), + (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + ]) + @pytest.mark.parametrize( + 'dt', + list(np.typecodes['AllInteger'] + np.typecodes['Float']) + ) + def test_mode_numerical(self, dropna, data, expected, dt): + s = Series(data, dtype=dt) + result = s.mode(dropna) + expected = Series(expected, dtype=dt) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected', [ + (True, [1.0]), + (False, [1, np.nan]), + ]) + def test_mode_numerical_nan(self, dropna, expected): + s = Series([1, 1, 2, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, ['b'], ['bar'], ['nan']), + (False, ['b'], [np.nan], ['nan']) + ]) + def test_mode_str_obj(self, dropna, expected1, expected2, expected3): + # Test string and object types. + data = ['a'] * 2 + ['b'] * 3 + + s = Series(data, dtype='c') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='c') + tm.assert_series_equal(result, expected1) + + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + + s = Series(data, dtype=object) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected2) + + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + + s = Series(data, dtype=object).astype(str) + result = s.mode(dropna) + expected3 = Series(expected3, dtype=str) + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['foo'], ['foo']), + (False, ['foo'], [np.nan]) + ]) + def test_mode_mixeddtype(self, dropna, expected1, expected2): + s = Series([1, 'foo', 'foo']) + result = s.mode(dropna) + expected = Series(expected1) + tm.assert_series_equal(result, expected) + + s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['1900-05-03', '2011-01-03', '2013-01-02'], + ['2011-01-03', '2013-01-02']), + (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), + ]) + def test_mode_datetime(self, dropna, expected1, expected2): + s = Series(['2011-01-03', '2013-01-02', + '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='M8[ns]') + tm.assert_series_equal(result, expected1) + + s = Series(['2011-01-03', '2013-01-02', '1900-05-03', + '2011-01-03', '2013-01-02', 'nan', 'nan'], + dtype='M8[ns]') + result = s.mode(dropna) + expected2 = Series(expected2, dtype='M8[ns]') + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), + (False, [np.nan], [np.nan, '2 min', '1 day']), + ]) + def test_mode_timedelta(self, dropna, expected1, expected2): + # gh-5986: Test timedelta types. + + s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], + dtype='timedelta64[ns]') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='timedelta64[ns]') + tm.assert_series_equal(result, expected1) + + s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', + '2 min', '2 min', 'nan', 'nan'], + dtype='timedelta64[ns]') + result = s.mode(dropna) + expected2 = Series(expected2, dtype='timedelta64[ns]') + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, Categorical([1, 2], categories=[1, 2]), + Categorical(['a'], categories=[1, 'a']), + Categorical([3, 1], categories=[3, 2, 1], ordered=True)), + (False, Categorical([np.nan], categories=[1, 2]), + Categorical([np.nan, 'a'], categories=[1, 'a']), + Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), + ]) + def test_mode_category(self, dropna, expected1, expected2, expected3): + s = Series(Categorical([1, 2, np.nan, np.nan])) + result = s.mode(dropna) + expected1 = Series(expected1, dtype='category') + tm.assert_series_equal(result, expected1) + + s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) + result = s.mode(dropna) + expected2 = Series(expected2, dtype='category') + tm.assert_series_equal(result, expected2) + + s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], + categories=[3, 2, 1], ordered=True)) + result = s.mode(dropna) + expected3 = Series(expected3, dtype='category') + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, [2**63], [1, 2**63]), + (False, [2**63], [1, 2**63]) + ]) + def test_mode_intoverflow(self, dropna, expected1, expected2): + # Test for uint64 overflow. + s = Series([1, 2**63, 2**63], dtype=np.uint64) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=np.uint64) + tm.assert_series_equal(result, expected1) + + s = Series([1, 2**63], dtype=np.uint64) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=np.uint64) + tm.assert_series_equal(result, expected2) + + @pytest.mark.skipif(not compat.PY3, reason="only PY3") + def test_mode_sortwarning(self): + # Check for the warning that is raised when the mode + # results cannot be sorted + + expected = Series(['foo', np.nan]) + s = Series([1, 'foo', 'foo', np.nan, np.nan]) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = s.mode(dropna=False) + result = result.sort_values().reset_index(drop=True) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py new file mode 100644 index 0000000000000..1146e0793d4f5 --- /dev/null +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- +""" +Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ... +""" + +import numpy as np +import pytest + +from pandas.compat import lrange +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series, compat +import pandas.util.testing as tm + + +class TestSeriesStatReductions(object): + # Note: the name TestSeriesStatReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def _check_stat_op(self, name, alternate, string_series_, + check_objects=False, check_allna=False): + + with pd.option_context('use_bottleneck', False): + f = getattr(Series, name) + + # add some NaNs + string_series_[5:15] = np.NaN + + # idxmax, idxmin, min, and max are valid for dates + if name not in ['max', 'min']: + ds = Series(pd.date_range('1/1/2001', periods=10)) + with pytest.raises(TypeError): + f(ds) + + # skipna or no + assert pd.notna(f(string_series_)) + assert pd.isna(f(string_series_, skipna=False)) + + # check the result is correct + nona = string_series_.dropna() + tm.assert_almost_equal(f(nona), alternate(nona.values)) + tm.assert_almost_equal(f(string_series_), alternate(nona.values)) + + allna = string_series_ * np.nan + + if check_allna: + assert np.isnan(f(allna)) + + # dtype=object with None, it works! + s = Series([1, 2, 3, None, 5]) + f(s) + + # GH#2888 + items = [0] + items.extend(lrange(2 ** 40, 2 ** 40 + 1000)) + s = Series(items, dtype='int64') + tm.assert_almost_equal(float(f(s)), float(alternate(s.values))) + + # check date range + if check_objects: + s = Series(pd.bdate_range('1/1/2000', periods=10)) + res = f(s) + exp = alternate(s) + assert res == exp + + # check on string data + if name not in ['sum', 'min', 'max']: + with pytest.raises(TypeError): + f(Series(list('abc'))) + + # Invalid axis. + with pytest.raises(ValueError): + f(string_series_, axis=1) + + # Unimplemented numeric_only parameter. + if 'numeric_only' in compat.signature(f).args: + with pytest.raises(NotImplementedError, match=name): + f(string_series_, numeric_only=True) + + def test_sum(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('sum', np.sum, string_series, check_allna=False) + + def test_mean(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('mean', np.mean, string_series) + + def test_median(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('median', np.median, string_series) + + # test with integers, test failure + int_ts = Series(np.ones(10, dtype=int), index=lrange(10)) + tm.assert_almost_equal(np.median(int_ts), int_ts.median()) + + def test_prod(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('prod', np.prod, string_series) + + def test_min(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('min', np.min, string_series, check_objects=True) + + def test_max(self): + string_series = tm.makeStringSeries().rename('series') + self._check_stat_op('max', np.max, string_series, check_objects=True) + + def test_var_std(self): + string_series = tm.makeStringSeries().rename('series') + datetime_series = tm.makeTimeSeries().rename('ts') + + alt = lambda x: np.std(x, ddof=1) + self._check_stat_op('std', alt, string_series) + + alt = lambda x: np.var(x, ddof=1) + self._check_stat_op('var', alt, string_series) + + result = datetime_series.std(ddof=4) + expected = np.std(datetime_series.values, ddof=4) + tm.assert_almost_equal(result, expected) + + result = datetime_series.var(ddof=4) + expected = np.var(datetime_series.values, ddof=4) + tm.assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = datetime_series.iloc[[0]] + result = s.var(ddof=1) + assert pd.isna(result) + + result = s.std(ddof=1) + assert pd.isna(result) + + def test_sem(self): + string_series = tm.makeStringSeries().rename('series') + datetime_series = tm.makeTimeSeries().rename('ts') + + alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) + self._check_stat_op('sem', alt, string_series) + + result = datetime_series.sem(ddof=4) + expected = np.std(datetime_series.values, + ddof=4) / np.sqrt(len(datetime_series.values)) + tm.assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = datetime_series.iloc[[0]] + result = s.sem(ddof=1) + assert pd.isna(result) + + @td.skip_if_no_scipy + def test_skew(self): + from scipy.stats import skew + + string_series = tm.makeStringSeries().rename('series') + + alt = lambda x: skew(x, bias=False) + self._check_stat_op('skew', alt, string_series) + + # test corner cases, skew() returns NaN unless there's at least 3 + # values + min_N = 3 + for i in range(1, min_N + 1): + s = Series(np.ones(i)) + df = DataFrame(np.ones((i, i))) + if i < min_N: + assert np.isnan(s.skew()) + assert np.isnan(df.skew()).all() + else: + assert 0 == s.skew() + assert (df.skew() == 0).all() + + @td.skip_if_no_scipy + def test_kurt(self): + from scipy.stats import kurtosis + + string_series = tm.makeStringSeries().rename('series') + + alt = lambda x: kurtosis(x, bias=False) + self._check_stat_op('kurt', alt, string_series) + + index = pd.MultiIndex( + levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]] + ) + s = Series(np.random.randn(6), index=index) + tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar']) + + # test corner cases, kurt() returns NaN unless there's at least 4 + # values + min_N = 4 + for i in range(1, min_N + 1): + s = Series(np.ones(i)) + df = DataFrame(np.ones((i, i))) + if i < min_N: + assert np.isnan(s.kurt()) + assert np.isnan(df.kurt()).all() + else: + assert 0 == s.kurt() + assert (df.kurt() == 0).all() diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 73c6ea67ee8aa..b5140a5319c01 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -14,13 +14,11 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, Series, bdate_range, compat, - date_range, isna, notna) + Categorical, CategoricalIndex, DataFrame, Series, compat, date_range, isna, + notna) from pandas.api.types import is_scalar from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp -from pandas.core.indexes.timedeltas import Timedelta -import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_index_equal, @@ -29,292 +27,6 @@ class TestSeriesAnalytics(object): - @pytest.mark.parametrize("use_bottleneck", [True, False]) - @pytest.mark.parametrize("method, unit", [ - ("sum", 0.0), - ("prod", 1.0) - ]) - def test_empty(self, method, unit, use_bottleneck): - with pd.option_context("use_bottleneck", use_bottleneck): - # GH 9422 / 18921 - # Entirely empty - s = Series([]) - # NA by default - result = getattr(s, method)() - assert result == unit - - # Explicit - result = getattr(s, method)(min_count=0) - assert result == unit - - result = getattr(s, method)(min_count=1) - assert isna(result) - - # Skipna, default - result = getattr(s, method)(skipna=True) - result == unit - - # Skipna, explicit - result = getattr(s, method)(skipna=True, min_count=0) - assert result == unit - - result = getattr(s, method)(skipna=True, min_count=1) - assert isna(result) - - # All-NA - s = Series([np.nan]) - # NA by default - result = getattr(s, method)() - assert result == unit - - # Explicit - result = getattr(s, method)(min_count=0) - assert result == unit - - result = getattr(s, method)(min_count=1) - assert isna(result) - - # Skipna, default - result = getattr(s, method)(skipna=True) - result == unit - - # skipna, explicit - result = getattr(s, method)(skipna=True, min_count=0) - assert result == unit - - result = getattr(s, method)(skipna=True, min_count=1) - assert isna(result) - - # Mix of valid, empty - s = Series([np.nan, 1]) - # Default - result = getattr(s, method)() - assert result == 1.0 - - # Explicit - result = getattr(s, method)(min_count=0) - assert result == 1.0 - - result = getattr(s, method)(min_count=1) - assert result == 1.0 - - # Skipna - result = getattr(s, method)(skipna=True) - assert result == 1.0 - - result = getattr(s, method)(skipna=True, min_count=0) - assert result == 1.0 - - result = getattr(s, method)(skipna=True, min_count=1) - assert result == 1.0 - - # GH #844 (changed in 9422) - df = DataFrame(np.empty((10, 0))) - assert (getattr(df, method)(1) == unit).all() - - s = pd.Series([1]) - result = getattr(s, method)(min_count=2) - assert isna(result) - - s = pd.Series([np.nan]) - result = getattr(s, method)(min_count=2) - assert isna(result) - - s = pd.Series([np.nan, 1]) - result = getattr(s, method)(min_count=2) - assert isna(result) - - @pytest.mark.parametrize('method, unit', [ - ('sum', 0.0), - ('prod', 1.0), - ]) - def test_empty_multi(self, method, unit): - s = pd.Series([1, np.nan, np.nan, np.nan], - index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) - # 1 / 0 by default - result = getattr(s, method)(level=0) - expected = pd.Series([1, unit], index=['a', 'b']) - tm.assert_series_equal(result, expected) - - # min_count=0 - result = getattr(s, method)(level=0, min_count=0) - expected = pd.Series([1, unit], index=['a', 'b']) - tm.assert_series_equal(result, expected) - - # min_count=1 - result = getattr(s, method)(level=0, min_count=1) - expected = pd.Series([1, np.nan], index=['a', 'b']) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "method", ['mean', 'median', 'std', 'var']) - def test_ops_consistency_on_empty(self, method): - - # GH 7869 - # consistency on empty - - # float - result = getattr(Series(dtype=float), method)() - assert isna(result) - - # timedelta64[ns] - result = getattr(Series(dtype='m8[ns]'), method)() - assert result is pd.NaT - - def test_nansum_buglet(self): - s = Series([1.0, np.nan], index=[0, 1]) - result = np.nansum(s) - assert_almost_equal(result, 1) - - @pytest.mark.parametrize("use_bottleneck", [True, False]) - def test_sum_overflow(self, use_bottleneck): - - with pd.option_context('use_bottleneck', use_bottleneck): - # GH 6915 - # overflowing on the smaller int dtypes - for dtype in ['int32', 'int64']: - v = np.arange(5000000, dtype=dtype) - s = Series(v) - - result = s.sum(skipna=False) - assert int(result) == v.sum(dtype='int64') - result = s.min(skipna=False) - assert int(result) == 0 - result = s.max(skipna=False) - assert int(result) == v[-1] - - for dtype in ['float32', 'float64']: - v = np.arange(5000000, dtype=dtype) - s = Series(v) - - result = s.sum(skipna=False) - assert result == v.sum(dtype=dtype) - result = s.min(skipna=False) - assert np.allclose(float(result), 0.0) - result = s.max(skipna=False) - assert np.allclose(float(result), v[-1]) - - def test_sum(self, string_series): - self._check_stat_op('sum', np.sum, string_series, check_allna=False) - - def test_sum_inf(self): - s = Series(np.random.randn(10)) - s2 = s.copy() - - s[5:8] = np.inf - s2[5:8] = np.nan - - assert np.isinf(s.sum()) - - arr = np.random.randn(100, 100).astype('f4') - arr[:, 2] = np.inf - - with pd.option_context("mode.use_inf_as_na", True): - assert_almost_equal(s.sum(), s2.sum()) - - res = nanops.nansum(arr, axis=1) - assert np.isinf(res).all() - - def test_mean(self, string_series): - self._check_stat_op('mean', np.mean, string_series) - - def test_median(self, string_series): - self._check_stat_op('median', np.median, string_series) - - # test with integers, test failure - int_ts = Series(np.ones(10, dtype=int), index=lrange(10)) - tm.assert_almost_equal(np.median(int_ts), int_ts.median()) - - def test_prod(self, string_series): - self._check_stat_op('prod', np.prod, string_series) - - def test_min(self, string_series): - self._check_stat_op('min', np.min, string_series, check_objects=True) - - def test_max(self, string_series): - self._check_stat_op('max', np.max, string_series, check_objects=True) - - def test_var_std(self, datetime_series, string_series): - alt = lambda x: np.std(x, ddof=1) - self._check_stat_op('std', alt, string_series) - - alt = lambda x: np.var(x, ddof=1) - self._check_stat_op('var', alt, string_series) - - result = datetime_series.std(ddof=4) - expected = np.std(datetime_series.values, ddof=4) - assert_almost_equal(result, expected) - - result = datetime_series.var(ddof=4) - expected = np.var(datetime_series.values, ddof=4) - assert_almost_equal(result, expected) - - # 1 - element series with ddof=1 - s = datetime_series.iloc[[0]] - result = s.var(ddof=1) - assert isna(result) - - result = s.std(ddof=1) - assert isna(result) - - def test_sem(self, datetime_series, string_series): - alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) - self._check_stat_op('sem', alt, string_series) - - result = datetime_series.sem(ddof=4) - expected = np.std(datetime_series.values, - ddof=4) / np.sqrt(len(datetime_series.values)) - assert_almost_equal(result, expected) - - # 1 - element series with ddof=1 - s = datetime_series.iloc[[0]] - result = s.sem(ddof=1) - assert isna(result) - - @td.skip_if_no_scipy - def test_skew(self, string_series): - from scipy.stats import skew - alt = lambda x: skew(x, bias=False) - self._check_stat_op('skew', alt, string_series) - - # test corner cases, skew() returns NaN unless there's at least 3 - # values - min_N = 3 - for i in range(1, min_N + 1): - s = Series(np.ones(i)) - df = DataFrame(np.ones((i, i))) - if i < min_N: - assert np.isnan(s.skew()) - assert np.isnan(df.skew()).all() - else: - assert 0 == s.skew() - assert (df.skew() == 0).all() - - @td.skip_if_no_scipy - def test_kurt(self, string_series): - from scipy.stats import kurtosis - alt = lambda x: kurtosis(x, bias=False) - self._check_stat_op('kurt', alt, string_series) - - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) - s = Series(np.random.randn(6), index=index) - tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar']) - - # test corner cases, kurt() returns NaN unless there's at least 4 - # values - min_N = 4 - for i in range(1, min_N + 1): - s = Series(np.ones(i)) - df = DataFrame(np.ones((i, i))) - if i < min_N: - assert np.isnan(s.kurt()) - assert np.isnan(df.kurt()).all() - else: - assert 0 == s.kurt() - assert (df.kurt() == 0).all() - def test_describe(self): s = Series([0, 1, 2, 3, 4], name='int_data') result = s.describe() @@ -508,63 +220,6 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) - def _check_stat_op(self, name, alternate, string_series_, - check_objects=False, check_allna=False): - - with pd.option_context('use_bottleneck', False): - f = getattr(Series, name) - - # add some NaNs - string_series_[5:15] = np.NaN - - # idxmax, idxmin, min, and max are valid for dates - if name not in ['max', 'min']: - ds = Series(date_range('1/1/2001', periods=10)) - pytest.raises(TypeError, f, ds) - - # skipna or no - assert notna(f(string_series_)) - assert isna(f(string_series_, skipna=False)) - - # check the result is correct - nona = string_series_.dropna() - assert_almost_equal(f(nona), alternate(nona.values)) - assert_almost_equal(f(string_series_), alternate(nona.values)) - - allna = string_series_ * nan - - if check_allna: - assert np.isnan(f(allna)) - - # dtype=object with None, it works! - s = Series([1, 2, 3, None, 5]) - f(s) - - # 2888 - items = [0] - items.extend(lrange(2 ** 40, 2 ** 40 + 1000)) - s = Series(items, dtype='int64') - assert_almost_equal(float(f(s)), float(alternate(s.values))) - - # check date range - if check_objects: - s = Series(bdate_range('1/1/2000', periods=10)) - res = f(s) - exp = alternate(s) - assert res == exp - - # check on string data - if name not in ['sum', 'min', 'max']: - pytest.raises(TypeError, f, Series(list('abc'))) - - # Invalid axis. - pytest.raises(ValueError, f, string_series_, axis=1) - - # Unimplemented numeric_only parameter. - if 'numeric_only' in compat.signature(f).args: - with pytest.raises(NotImplementedError, match=name): - f(string_series_, numeric_only=True) - def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal(func(datetime_series_).values, @@ -649,39 +304,6 @@ def test_prod_numpy16_bug(self): assert not isinstance(result, Series) - def test_all_any(self): - ts = tm.makeTimeSeries() - bool_series = ts > 0 - assert not bool_series.all() - assert bool_series.any() - - # Alternative types, with implicit 'object' dtype. - s = Series(['abc', True]) - assert 'abc' == s.any() # 'abc' || True => 'abc' - - def test_all_any_params(self): - # Check skipna, with implicit 'object' dtype. - s1 = Series([np.nan, True]) - s2 = Series([np.nan, False]) - assert s1.all(skipna=False) # nan && True => True - assert s1.all(skipna=True) - assert np.isnan(s2.any(skipna=False)) # nan || False => nan - assert not s2.any(skipna=True) - - # Check level. - s = pd.Series([False, False, True, True, False, True], - index=[0, 0, 1, 1, 2, 2]) - assert_series_equal(s.all(level=0), Series([False, True, False])) - assert_series_equal(s.any(level=0), Series([False, True, True])) - - # bool_only is not implemented with level option. - pytest.raises(NotImplementedError, s.any, bool_only=True, level=0) - pytest.raises(NotImplementedError, s.all, bool_only=True, level=0) - - # bool_only is not implemented alone. - pytest.raises(NotImplementedError, s.any, bool_only=True) - pytest.raises(NotImplementedError, s.all, bool_only=True) - @td.skip_if_no_scipy def test_corr(self, datetime_series): import scipy.stats as stats @@ -1124,174 +746,6 @@ def test_isin_empty(self, empty): result = s.isin(empty) tm.assert_series_equal(expected, result) - def test_timedelta64_analytics(self): - from pandas import date_range - - # index min/max - td = Series(date_range('2012-1-1', periods=3, freq='D')) - \ - Timestamp('20120101') - - result = td.idxmin() - assert result == 0 - - result = td.idxmax() - assert result == 2 - - # GH 2982 - # with NaT - td[0] = np.nan - - result = td.idxmin() - assert result == 1 - - result = td.idxmax() - assert result == 2 - - # abs - s1 = Series(date_range('20120101', periods=3)) - s2 = Series(date_range('20120102', periods=3)) - expected = Series(s2 - s1) - - # this fails as numpy returns timedelta64[us] - # result = np.abs(s1-s2) - # assert_frame_equal(result,expected) - - result = (s1 - s2).abs() - assert_series_equal(result, expected) - - # max/min - result = td.max() - expected = Timedelta('2 days') - assert result == expected - - result = td.min() - expected = Timedelta('1 days') - assert result == expected - - def test_idxmin(self, string_series): - # test idxmin - # _check_stat_op approach can not be used here because of isna check. - - # add some NaNs - string_series[5:15] = np.NaN - - # skipna or no - assert string_series[string_series.idxmin()] == string_series.min() - assert isna(string_series.idxmin(skipna=False)) - - # no NaNs - nona = string_series.dropna() - assert nona[nona.idxmin()] == nona.min() - assert (nona.index.values.tolist().index(nona.idxmin()) == - nona.values.argmin()) - - # all NaNs - allna = string_series * nan - assert isna(allna.idxmin()) - - # datetime64[ns] - from pandas import date_range - s = Series(date_range('20130102', periods=6)) - result = s.idxmin() - assert result == 0 - - s[0] = np.nan - result = s.idxmin() - assert result == 1 - - def test_numpy_argmin_deprecated(self): - # See gh-16830 - data = np.arange(1, 11) - - s = Series(data, index=data) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # The deprecation of Series.argmin also causes a deprecation - # warning when calling np.argmin. This behavior is temporary - # until the implementation of Series.argmin is corrected. - result = np.argmin(s) - - assert result == 1 - - with tm.assert_produces_warning(FutureWarning): - # argmin is aliased to idxmin - result = s.argmin() - - assert result == 1 - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argmin(s, out=data) - - def test_idxmax(self, string_series): - # test idxmax - # _check_stat_op approach can not be used here because of isna check. - - # add some NaNs - string_series[5:15] = np.NaN - - # skipna or no - assert string_series[string_series.idxmax()] == string_series.max() - assert isna(string_series.idxmax(skipna=False)) - - # no NaNs - nona = string_series.dropna() - assert nona[nona.idxmax()] == nona.max() - assert (nona.index.values.tolist().index(nona.idxmax()) == - nona.values.argmax()) - - # all NaNs - allna = string_series * nan - assert isna(allna.idxmax()) - - from pandas import date_range - s = Series(date_range('20130102', periods=6)) - result = s.idxmax() - assert result == 5 - - s[5] = np.nan - result = s.idxmax() - assert result == 4 - - # Float64Index - # GH 5914 - s = pd.Series([1, 2, 3], [1.1, 2.1, 3.1]) - result = s.idxmax() - assert result == 3.1 - result = s.idxmin() - assert result == 1.1 - - s = pd.Series(s.index, s.index) - result = s.idxmax() - assert result == 3.1 - result = s.idxmin() - assert result == 1.1 - - def test_numpy_argmax_deprecated(self): - # See gh-16830 - data = np.arange(1, 11) - - s = Series(data, index=data) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # The deprecation of Series.argmax also causes a deprecation - # warning when calling np.argmax. This behavior is temporary - # until the implementation of Series.argmax is corrected. - result = np.argmax(s) - assert result == 10 - - with tm.assert_produces_warning(FutureWarning): - # argmax is aliased to idxmax - result = s.argmax() - - assert result == 10 - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argmax(s, out=data) - def test_ptp(self): # GH21614 N = 1000 @@ -1333,12 +787,6 @@ def test_ptp(self): check_stacklevel=False): s.ptp(numeric_only=True) - def test_empty_timeseries_redections_return_nat(self): - # covers #11245 - for dtype in ('m8[ns]', 'm8[ns]', 'M8[ns]', 'M8[ns, UTC]'): - assert Series([], dtype=dtype).min() is pd.NaT - assert Series([], dtype=dtype).max() is pd.NaT - def test_repeat(self): s = Series(np.random.randn(3), index=['a', 'b', 'c']) @@ -1735,180 +1183,6 @@ def s_main_dtypes_split(request, s_main_dtypes): return s_main_dtypes[request.param] -class TestMode(object): - - @pytest.mark.parametrize('dropna, expected', [ - (True, Series([], dtype=np.float64)), - (False, Series([], dtype=np.float64)) - ]) - def test_mode_empty(self, dropna, expected): - s = Series([], dtype=np.float64) - result = s.mode(dropna) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, data, expected', [ - (True, [1, 1, 1, 2], [1]), - (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - (False, [1, 1, 1, 2], [1]), - (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - ]) - @pytest.mark.parametrize( - 'dt', - list(np.typecodes['AllInteger'] + np.typecodes['Float']) - ) - def test_mode_numerical(self, dropna, data, expected, dt): - s = Series(data, dtype=dt) - result = s.mode(dropna) - expected = Series(expected, dtype=dt) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected', [ - (True, [1.0]), - (False, [1, np.nan]), - ]) - def test_mode_numerical_nan(self, dropna, expected): - s = Series([1, 1, 2, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, ['b'], ['bar'], ['nan']), - (False, ['b'], [np.nan], ['nan']) - ]) - def test_mode_str_obj(self, dropna, expected1, expected2, expected3): - # Test string and object types. - data = ['a'] * 2 + ['b'] * 3 - - s = Series(data, dtype='c') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='c') - tm.assert_series_equal(result, expected1) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected2) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object).astype(str) - result = s.mode(dropna) - expected3 = Series(expected3, dtype=str) - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['foo'], ['foo']), - (False, ['foo'], [np.nan]) - ]) - def test_mode_mixeddtype(self, dropna, expected1, expected2): - s = Series([1, 'foo', 'foo']) - result = s.mode(dropna) - expected = Series(expected1) - tm.assert_series_equal(result, expected) - - s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['1900-05-03', '2011-01-03', '2013-01-02'], - ['2011-01-03', '2013-01-02']), - (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), - ]) - def test_mode_datetime(self, dropna, expected1, expected2): - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='M8[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02', 'nan', 'nan'], - dtype='M8[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='M8[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), - (False, [np.nan], [np.nan, '2 min', '1 day']), - ]) - def test_mode_timedelta(self, dropna, expected1, expected2): - # gh-5986: Test timedelta types. - - s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, Categorical([1, 2], categories=[1, 2]), - Categorical(['a'], categories=[1, 'a']), - Categorical([3, 1], categories=[3, 2, 1], ordered=True)), - (False, Categorical([np.nan], categories=[1, 2]), - Categorical([np.nan, 'a'], categories=[1, 'a']), - Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), - ]) - def test_mode_category(self, dropna, expected1, expected2, expected3): - s = Series(Categorical([1, 2, np.nan, np.nan])) - result = s.mode(dropna) - expected1 = Series(expected1, dtype='category') - tm.assert_series_equal(result, expected1) - - s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) - result = s.mode(dropna) - expected2 = Series(expected2, dtype='category') - tm.assert_series_equal(result, expected2) - - s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], - categories=[3, 2, 1], ordered=True)) - result = s.mode(dropna) - expected3 = Series(expected3, dtype='category') - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, [2**63], [1, 2**63]), - (False, [2**63], [1, 2**63]) - ]) - def test_mode_intoverflow(self, dropna, expected1, expected2): - # Test for uint64 overflow. - s = Series([1, 2**63, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected1 = Series(expected1, dtype=np.uint64) - tm.assert_series_equal(result, expected1) - - s = Series([1, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=np.uint64) - tm.assert_series_equal(result, expected2) - - @pytest.mark.skipif(not compat.PY3, reason="only PY3") - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(['foo', np.nan]) - s = Series([1, 'foo', 'foo', np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - - tm.assert_series_equal(result, expected) - - def assert_check_nselect_boundary(vals, dtype, method): # helper function for 'test_boundary_{dtype}' tests s = Series(vals, dtype=dtype) @@ -2047,40 +1321,6 @@ def test_count(self): result = s.count() assert result == 2 - def test_min_max(self): - # unordered cats have no min/max - cat = Series(Categorical(["a", "b", "c", "d"], ordered=False)) - pytest.raises(TypeError, lambda: cat.min()) - pytest.raises(TypeError, lambda: cat.max()) - - cat = Series(Categorical(["a", "b", "c", "d"], ordered=True)) - _min = cat.min() - _max = cat.max() - assert _min == "a" - assert _max == "d" - - cat = Series(Categorical(["a", "b", "c", "d"], categories=[ - 'd', 'c', 'b', 'a'], ordered=True)) - _min = cat.min() - _max = cat.max() - assert _min == "d" - assert _max == "a" - - cat = Series(Categorical( - [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a' - ], ordered=True)) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == "b" - - cat = Series(Categorical( - [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True)) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == 1 - def test_value_counts(self): # GH 12835 cats = Categorical(list('abcccb'), categories=list('cabd')) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 0d617d5a26706..745a9eee6c300 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -531,24 +531,6 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): result = s.dt.timetz tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('nat', [ - pd.Series([pd.NaT, pd.NaT]), - pd.Series([pd.NaT, pd.Timedelta('nat')]), - pd.Series([pd.Timedelta('nat'), pd.Timedelta('nat')])]) - def test_minmax_nat_series(self, nat): - # GH 23282 - assert nat.min() is pd.NaT - assert nat.max() is pd.NaT - - @pytest.mark.parametrize('nat', [ - # GH 23282 - pd.DataFrame([pd.NaT, pd.NaT]), - pd.DataFrame([pd.NaT, pd.Timedelta('nat')]), - pd.DataFrame([pd.Timedelta('nat'), pd.Timedelta('nat')])]) - def test_minmax_nat_dataframe(self, nat): - assert nat.min()[0] is pd.NaT - assert nat.max()[0] is pd.NaT - def test_setitem_with_string_index(self): # GH 23451 x = pd.Series([1, 2, 3], index=['Date', 'b', 'other']) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 7c11880ae5f94..f6fb5f0c46cc8 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -750,53 +750,6 @@ def test_op_duplicate_index(self): expected = pd.Series([11, 12, np.nan], index=[1, 1, 2]) assert_series_equal(result, expected) - @pytest.mark.parametrize( - "test_input,error_type", - [ - (pd.Series([]), ValueError), - - # For strings, or any Series with dtype 'O' - (pd.Series(['foo', 'bar', 'baz']), TypeError), - (pd.Series([(1,), (2,)]), TypeError), - - # For mixed data types - ( - pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz']), - TypeError - ), - ] - ) - def test_assert_idxminmax_raises(self, test_input, error_type): - """ - Cases where ``Series.argmax`` and related should raise an exception - """ - with pytest.raises(error_type): - test_input.idxmin() - with pytest.raises(error_type): - test_input.idxmin(skipna=False) - with pytest.raises(error_type): - test_input.idxmax() - with pytest.raises(error_type): - test_input.idxmax(skipna=False) - - def test_idxminmax_with_inf(self): - # For numeric data with NA and Inf (GH #13595) - s = pd.Series([0, -np.inf, np.inf, np.nan]) - - assert s.idxmin() == 1 - assert np.isnan(s.idxmin(skipna=False)) - - assert s.idxmax() == 2 - assert np.isnan(s.idxmax(skipna=False)) - - # Using old-style behavior that treats floating point nan, -inf, and - # +inf as missing - with pd.option_context('mode.use_inf_as_na', True): - assert s.idxmin() == 0 - assert np.isnan(s.idxmin(skipna=False)) - assert s.idxmax() == 0 - np.isnan(s.idxmax(skipna=False)) - class TestSeriesUnaryOps(object): # __neg__, __pos__, __inv__ diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index ce464184cd8d6..4f47c308c9a13 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -967,35 +967,6 @@ def test_setops_preserve_freq(self, tz): assert result.freq == rng.freq assert result.tz == rng.tz - def test_min_max(self): - rng = date_range('1/1/2000', '12/31/2000') - rng2 = rng.take(np.random.permutation(len(rng))) - - the_min = rng2.min() - the_max = rng2.max() - assert isinstance(the_min, Timestamp) - assert isinstance(the_max, Timestamp) - assert the_min == rng[0] - assert the_max == rng[-1] - - assert rng.min() == rng[0] - assert rng.max() == rng[-1] - - def test_min_max_series(self): - rng = date_range('1/1/2000', periods=10, freq='4h') - lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] - df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls}) - - result = df.TS.max() - exp = Timestamp(df.TS.iat[-1]) - assert isinstance(result, Timestamp) - assert result == exp - - result = df.TS.min() - exp = Timestamp(df.TS.iat[0]) - assert isinstance(result, Timestamp) - assert result == exp - def test_from_M8_structured(self): dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] arr = np.array(dates, diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 6eada0e89b506..91e1af5c8887c 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -15,7 +15,7 @@ import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta, IntervalIndex, Interval, - CategoricalIndex, Timestamp) + CategoricalIndex, Timestamp, DataFrame, Panel) from pandas.compat import StringIO, PYPY, long from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.accessor import PandasDelegate @@ -49,8 +49,8 @@ class CheckImmutable(object): def check_mutable_error(self, *args, **kwargs): # Pass whatever function you normally would to pytest.raises # (after the Exception kind). - pytest.raises( - TypeError, self.mutable_regex, *args, **kwargs) + with pytest.raises(TypeError): + self.mutable_regex(*args, **kwargs) def test_no_mutable_funcs(self): def setitem(): @@ -227,14 +227,15 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): # an object that is datetimelike will raise a TypeError, # otherwise an AttributeError + err = AttributeError if issubclass(type(o), DatetimeIndexOpsMixin): - pytest.raises(TypeError, lambda: getattr(o, op)) - else: - pytest.raises(AttributeError, - lambda: getattr(o, op)) + err = TypeError + + with pytest.raises(err): + getattr(o, op) - def test_binary_ops_docs(self): - from pandas import DataFrame, Panel + @pytest.mark.parametrize('klass', [Series, DataFrame, Panel]) + def test_binary_ops_docs(self, klass): op_map = {'add': '+', 'sub': '-', 'mul': '*', @@ -242,18 +243,16 @@ def test_binary_ops_docs(self): 'pow': '**', 'truediv': '/', 'floordiv': '//'} - for op_name in ['add', 'sub', 'mul', 'mod', 'pow', 'truediv', - 'floordiv']: - for klass in [Series, DataFrame, Panel]: - operand1 = klass.__name__.lower() - operand2 = 'other' - op = op_map[op_name] - expected_str = ' '.join([operand1, op, operand2]) - assert expected_str in getattr(klass, op_name).__doc__ + for op_name in op_map: + operand1 = klass.__name__.lower() + operand2 = 'other' + op = op_map[op_name] + expected_str = ' '.join([operand1, op, operand2]) + assert expected_str in getattr(klass, op_name).__doc__ - # reverse version of the binary ops - expected_str = ' '.join([operand2, op, operand1]) - assert expected_str in getattr(klass, 'r' + op_name).__doc__ + # reverse version of the binary ops + expected_str = ' '.join([operand2, op, operand1]) + assert expected_str in getattr(klass, 'r' + op_name).__doc__ class TestIndexOps(Ops): @@ -338,68 +337,6 @@ def test_ndarray_compat_properties(self): assert Index([1]).item() == 1 assert Series([1]).item() == 1 - def test_ops(self): - for op in ['max', 'min']: - for o in self.objs: - result = getattr(o, op)() - if not isinstance(o, PeriodIndex): - expected = getattr(o.values, op)() - else: - expected = pd.Period( - ordinal=getattr(o._ndarray_values, op)(), - freq=o.freq) - try: - assert result == expected - except TypeError: - # comparing tz-aware series with np.array results in - # TypeError - expected = expected.astype('M8[ns]').astype('int64') - assert result.value == expected - - def test_nanops(self): - # GH 7261 - for op in ['max', 'min']: - for klass in [Index, Series]: - - obj = klass([np.nan, 2.0]) - assert getattr(obj, op)() == 2.0 - - obj = klass([np.nan]) - assert pd.isna(getattr(obj, op)()) - - obj = klass([]) - assert pd.isna(getattr(obj, op)()) - - obj = klass([pd.NaT, datetime(2011, 11, 1)]) - # check DatetimeIndex monotonic path - assert getattr(obj, op)() == datetime(2011, 11, 1) - - obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) - # check DatetimeIndex non-monotonic path - assert getattr(obj, op)(), datetime(2011, 11, 1) - - # argmin/max - obj = Index(np.arange(5, dtype='int64')) - assert obj.argmin() == 0 - assert obj.argmax() == 4 - - obj = Index([np.nan, 1, np.nan, 2]) - assert obj.argmin() == 1 - assert obj.argmax() == 3 - - obj = Index([np.nan]) - assert obj.argmin() == -1 - assert obj.argmax() == -1 - - obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), - pd.NaT]) - assert obj.argmin() == 1 - assert obj.argmax() == 2 - - obj = Index([pd.NaT]) - assert obj.argmin() == -1 - assert obj.argmax() == -1 - def test_value_counts_unique_nunique(self): for orig in self.objs: o = orig.copy() @@ -546,106 +483,105 @@ def test_value_counts_unique_nunique_null(self): assert o.nunique() == 8 assert o.nunique(dropna=False) == 9 - def test_value_counts_inferred(self): - klasses = [Index, Series] - for klass in klasses: - s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] - s = klass(s_values) - expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) - tm.assert_series_equal(s.value_counts(), expected) - - if isinstance(s, Index): - exp = Index(np.unique(np.array(s_values, dtype=np.object_))) - tm.assert_index_equal(s.unique(), exp) - else: - exp = np.unique(np.array(s_values, dtype=np.object_)) - tm.assert_numpy_array_equal(s.unique(), exp) - - assert s.nunique() == 4 - # don't sort, have to sort after the fact as not sorting is - # platform-dep - hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values() - tm.assert_series_equal(hist, expected) - - # sort ascending - hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list('cdab')) - tm.assert_series_equal(hist, expected) - - # relative histogram. - hist = s.value_counts(normalize=True) - expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) - tm.assert_series_equal(hist, expected) - - def test_value_counts_bins(self): - klasses = [Index, Series] - for klass in klasses: - s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] - s = klass(s_values) - - # bins - pytest.raises(TypeError, lambda bins: s.value_counts(bins=bins), 1) - - s1 = Series([1, 1, 2, 3]) - res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.997, 3.0): 4}) - tm.assert_series_equal(res1, exp1) - res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}) - tm.assert_series_equal(res1n, exp1n) - - if isinstance(s1, Index): - tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) - else: - exp = np.array([1, 2, 3], dtype=np.int64) - tm.assert_numpy_array_equal(s1.unique(), exp) - - assert s1.nunique() == 3 - - # these return the same - res4 = s1.value_counts(bins=4, dropna=True) - intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4, exp4) - - res4 = s1.value_counts(bins=4, dropna=False) - intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4, exp4) - - res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], - index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4n, exp4n) - - # handle NA's properly - s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, - 'd', 'd', 'a', 'a', 'b'] - s = klass(s_values) - expected = Series([4, 3, 2], index=['b', 'a', 'd']) - tm.assert_series_equal(s.value_counts(), expected) - - if isinstance(s, Index): - exp = Index(['a', 'b', np.nan, 'd']) - tm.assert_index_equal(s.unique(), exp) - else: - exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) - tm.assert_numpy_array_equal(s.unique(), exp) - assert s.nunique() == 3 - - s = klass({}) - expected = Series([], dtype=np.int64) - tm.assert_series_equal(s.value_counts(), expected, - check_index_type=False) - # returned dtype differs depending on original - if isinstance(s, Index): - tm.assert_index_equal(s.unique(), Index([]), exact=False) - else: - tm.assert_numpy_array_equal(s.unique(), np.array([]), - check_dtype=False) + @pytest.mark.parametrize('klass', [Index, Series]) + def test_value_counts_inferred(self, klass): + s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] + s = klass(s_values) + expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(np.unique(np.array(s_values, dtype=np.object_))) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.unique(np.array(s_values, dtype=np.object_)) + tm.assert_numpy_array_equal(s.unique(), exp) + + assert s.nunique() == 4 + # don't sort, have to sort after the fact as not sorting is + # platform-dep + hist = s.value_counts(sort=False).sort_values() + expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values() + tm.assert_series_equal(hist, expected) + + # sort ascending + hist = s.value_counts(ascending=True) + expected = Series([1, 2, 3, 4], index=list('cdab')) + tm.assert_series_equal(hist, expected) + + # relative histogram. + hist = s.value_counts(normalize=True) + expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) + tm.assert_series_equal(hist, expected) + + @pytest.mark.parametrize('klass', [Index, Series]) + def test_value_counts_bins(self, klass): + s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] + s = klass(s_values) + + # bins + with pytest.raises(TypeError): + s.value_counts(bins=1) + + s1 = Series([1, 1, 2, 3]) + res1 = s1.value_counts(bins=1) + exp1 = Series({Interval(0.997, 3.0): 4}) + tm.assert_series_equal(res1, exp1) + res1n = s1.value_counts(bins=1, normalize=True) + exp1n = Series({Interval(0.997, 3.0): 1.0}) + tm.assert_series_equal(res1n, exp1n) + + if isinstance(s1, Index): + tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) + else: + exp = np.array([1, 2, 3], dtype=np.int64) + tm.assert_numpy_array_equal(s1.unique(), exp) + + assert s1.nunique() == 3 + + # these return the same + res4 = s1.value_counts(bins=4, dropna=True) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4 = s1.value_counts(bins=4, dropna=False) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4n = s1.value_counts(bins=4, normalize=True) + exp4n = Series([0.5, 0.25, 0.25, 0], + index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4n, exp4n) + + # handle NA's properly + s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, + 'd', 'd', 'a', 'a', 'b'] + s = klass(s_values) + expected = Series([4, 3, 2], index=['b', 'a', 'd']) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(['a', 'b', np.nan, 'd']) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) + tm.assert_numpy_array_equal(s.unique(), exp) + assert s.nunique() == 3 - assert s.nunique() == 0 + s = klass({}) + expected = Series([], dtype=np.int64) + tm.assert_series_equal(s.value_counts(), expected, + check_index_type=False) + # returned dtype differs depending on original + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), Index([]), exact=False) + else: + tm.assert_numpy_array_equal(s.unique(), np.array([]), + check_dtype=False) + + assert s.nunique() == 0 @pytest.mark.parametrize('klass', [Index, Series]) def test_value_counts_datetime64(self, klass): @@ -1001,8 +937,10 @@ def test_getitem(self): assert i[-1] == i[9] - pytest.raises(IndexError, i.__getitem__, 20) - pytest.raises(IndexError, s.iloc.__getitem__, 20) + with pytest.raises(IndexError): + i[20] + with pytest.raises(IndexError): + s.iloc[20] @pytest.mark.parametrize('indexer_klass', [list, pd.Index]) @pytest.mark.parametrize('indexer', [[True] * 10, [False] * 10, @@ -1022,10 +960,7 @@ class TestTranspose(Ops): def test_transpose(self): for obj in self.objs: - if isinstance(obj, Index): - tm.assert_index_equal(obj.transpose(), obj) - else: - tm.assert_series_equal(obj.transpose(), obj) + tm.assert_equal(obj.transpose(), obj) def test_transpose_non_default_axes(self): for obj in self.objs: @@ -1036,10 +971,7 @@ def test_transpose_non_default_axes(self): def test_numpy_transpose(self): for obj in self.objs: - if isinstance(obj, Index): - tm.assert_index_equal(np.transpose(obj), obj) - else: - tm.assert_series_equal(np.transpose(obj), obj) + tm.assert_equal(np.transpose(obj), obj) with pytest.raises(ValueError, match=self.errmsg): np.transpose(obj, axes=1)