From 0e50e19321ca690e2c6bd3ad0f9ceedb862550b5 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 09:01:14 +0000 Subject: [PATCH 01/23] sort out series / index case --- pandas/core/algorithms.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 381f76c4502d6..20efbdddb283c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -842,7 +842,8 @@ def value_counts( Series, ) - name = getattr(values, "name", None) + index_name = getattr(values, "name", None) + name = "proportion" if normalize else "count" if bins is not None: from pandas.core.reshape.tile import cut @@ -869,18 +870,22 @@ def value_counts( else: if is_extension_array_dtype(values): - # handle Categorical and sparse, result = Series(values)._values.value_counts(dropna=dropna) result.name = name + result.index.name = index_name counts = result._values elif isinstance(values, ABCMultiIndex): # GH49558 levels = list(range(values.nlevels)) - result = Series(index=values).groupby(level=levels, dropna=dropna).size() + result = ( + Series(index=values, name=name) + .groupby(level=levels, dropna=dropna) + .size() + ) # TODO: allow index names to remain (see discussion in GH49497) - result.index.names = [None] * values.nlevels + result.index.names = values.names counts = result._values else: @@ -892,6 +897,7 @@ def value_counts( idx = Index._with_infer(keys) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) + idx.name = index_name result = Series(counts, index=idx, name=name) From 214cc340857dd7766182dc2826d3079603dd0117 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 09:25:56 +0000 Subject: [PATCH 02/23] getting there... 
--- .../tests/series/methods/test_value_counts.py | 81 ++++++++++++------- 1 file changed, 53 insertions(+), 28 deletions(-) diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index a7d57eee7e5a1..f54489ac8a8b4 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -5,6 +5,7 @@ from pandas import ( Categorical, CategoricalIndex, + Index, Series, ) import pandas._testing as tm @@ -23,9 +24,10 @@ def test_value_counts_datetime(self): ] exp_idx = pd.DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], + name="xxx", ) - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -34,7 +36,7 @@ def test_value_counts_datetime(self): tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -51,15 +53,16 @@ def test_value_counts_datetime_tz(self): exp_idx = pd.DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], tz="US/Eastern", + name="xxx", ) - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) idx = pd.DatetimeIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) 
tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -73,8 +76,10 @@ def test_value_counts_period(self): pd.Period("2011-03", freq="M"), ] - exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp_idx = pd.PeriodIndex( + ["2011-01", "2011-03", "2011-02"], freq="M", name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -83,7 +88,7 @@ def test_value_counts_period(self): tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -91,8 +96,10 @@ def test_value_counts_categorical_ordered(self): # most dtypes are tested in tests/base values = Categorical([1, 2, 3, 1, 1, 3], ordered=True) - exp_idx = CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp_idx = CategoricalIndex( + [1, 3, 2], categories=[1, 2, 3], ordered=True, name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -101,15 +108,17 @@ def test_value_counts_categorical_ordered(self): tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) def test_value_counts_categorical_not_ordered(self): values = Categorical([1, 2, 3, 1, 1, 3], ordered=False) - exp_idx = CategoricalIndex([1, 3, 2], categories=[1, 2, 3], 
ordered=False) - exp = Series([3, 2, 1], index=exp_idx, name="xxx") + exp_idx = CategoricalIndex( + [1, 3, 2], categories=[1, 2, 3], ordered=False, name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -118,7 +127,7 @@ def test_value_counts_categorical_not_ordered(self): tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -128,21 +137,25 @@ def test_value_counts_categorical(self): ser = Series(cats, name="xxx") res = ser.value_counts(sort=False) - exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) - exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) + exp_index = CategoricalIndex( + list("cabd"), categories=cats.categories, name="xxx" + ) + exp = Series([3, 1, 2, 0], name="count", index=exp_index) tm.assert_series_equal(res, exp) res = ser.value_counts(sort=True) - exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) - exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) + exp_index = CategoricalIndex( + list("cbad"), categories=cats.categories, name="xxx" + ) + exp = Series([3, 2, 1, 0], name="count", index=exp_index) tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same # (tested in tests/base) ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx") res = ser.value_counts() - exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) + exp = Series([3, 2, 1], name="count", index=Index(["c", "b", "a"], name="xxx")) tm.assert_series_equal(res, exp) def test_value_counts_categorical_with_nan(self): @@ -150,7 +163,7 @@ def test_value_counts_categorical_with_nan(self): # sanity check ser = Series(["a", 
"b", "a"], dtype="category") - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count") res = ser.value_counts(dropna=True) tm.assert_series_equal(res, exp) @@ -168,18 +181,22 @@ def test_value_counts_categorical_with_nan(self): for ser in series: # None is a NaN value, so we exclude its count here - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count") res = ser.value_counts(dropna=True) tm.assert_series_equal(res, exp) # we don't exclude the count of None and sort by counts - exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) + exp = Series( + [3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]), name="count" + ) res = ser.value_counts(dropna=False) tm.assert_series_equal(res, exp) # When we aren't sorting by counts, and np.nan isn't a # category, it should be last. - exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) + exp = Series( + [2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]), name="count" + ) res = ser.value_counts(dropna=False, sort=False) tm.assert_series_equal(res, exp) @@ -189,17 +206,17 @@ def test_value_counts_categorical_with_nan(self): ( Series([False, True, True, pd.NA]), False, - Series([2, 1, 1], index=[True, False, pd.NA]), + Series([2, 1, 1], index=[True, False, pd.NA], name="count"), ), ( Series([False, True, True, pd.NA]), True, - Series([2, 1], index=pd.Index([True, False], dtype=object)), + Series([2, 1], index=Index([True, False], dtype=object), name="count"), ), ( Series(range(3), index=[True, False, np.nan]).index, False, - Series([1, 1, 1], index=[True, False, np.nan]), + Series([1, 1, 1], index=[True, False, np.nan], name="count"), ), ], ) @@ -213,11 +230,19 @@ def test_value_counts_bool_with_nan(self, ser, dropna, exp): [ ( [1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], - Series([3, 2, 1], index=pd.Index([3j, 1 + 1j, 1], dtype=np.complex128)), + Series( + [3, 
2, 1], + index=Index([3j, 1 + 1j, 1], dtype=np.complex128), + name="count", + ), ), ( np.array([1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], dtype=np.complex64), - Series([3, 2, 1], index=pd.Index([3j, 1 + 1j, 1], dtype=np.complex64)), + Series( + [3, 2, 1], + index=Index([3j, 1 + 1j, 1], dtype=np.complex64), + name="count", + ), ), ], ) From c1fa2083ab1d62f07868f3def26b63b9a5873cc0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 09:44:53 +0000 Subject: [PATCH 03/23] only just fixed up another file! --- pandas/tests/base/test_value_counts.py | 50 ++++++++++++++++---------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index dafbd9fee1b8e..26a08f450265d 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -13,6 +13,7 @@ Index, Interval, IntervalIndex, + MultiIndex, Series, Timedelta, TimedeltaIndex, @@ -27,8 +28,12 @@ def test_value_counts(index_or_series_obj): result = obj.value_counts() counter = collections.Counter(obj) - expected = Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) + expected = Series(dict(counter.most_common()), dtype=np.int64, name="count") expected.index = expected.index.astype(obj.dtype) + if isinstance(expected.index, MultiIndex): + expected.index.names = obj.names + else: + expected.index.name = obj.name if not isinstance(result.dtype, np.dtype): # i.e IntegerDtype @@ -59,7 +64,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): pytest.skip("type doesn't allow for NA operations") elif len(obj) < 1: pytest.skip("Test doesn't make sense on empty data") - elif isinstance(orig, pd.MultiIndex): + elif isinstance(orig, MultiIndex): pytest.skip(f"MultiIndex can't hold '{null_obj}'") values = obj._values @@ -72,8 +77,9 @@ def test_value_counts_null(null_obj, index_or_series_obj): # because np.nan == np.nan is False, but None == None is True # np.nan would be duplicated, 
whereas None wouldn't counter = collections.Counter(obj.dropna()) - expected = Series(dict(counter.most_common()), dtype=np.int64) + expected = Series(dict(counter.most_common()), dtype=np.int64, name="count") expected.index = expected.index.astype(obj.dtype) + expected.index.name = obj.name result = obj.value_counts() if obj.duplicated().any(): @@ -118,7 +124,7 @@ def test_value_counts_inferred(index_or_series): klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) - expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count") tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -132,17 +138,19 @@ def test_value_counts_inferred(index_or_series): # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd"), name="count").sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list("cdab")) + expected = Series([1, 2, 3, 4], index=list("cdab"), name="count") tm.assert_series_equal(hist, expected) # relative histogram. 
hist = s.value_counts(normalize=True) - expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) + expected = Series( + [0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion" + ) tm.assert_series_equal(hist, expected) @@ -158,10 +166,10 @@ def test_value_counts_bins(index_or_series): s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.997, 3.0): 4}) + exp1 = Series({Interval(0.997, 3.0): 4}, name="count") tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}) + exp1n = Series({Interval(0.997, 3.0): 1.0}, name="count") tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): @@ -175,22 +183,24 @@ def test_value_counts_bins(index_or_series): # these return the same res4 = s1.value_counts(bins=4, dropna=True) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2])) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count") tm.assert_series_equal(res4, exp4) res4 = s1.value_counts(bins=4, dropna=False) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2])) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count") tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2])) + exp4n = Series( + [0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="count" + ) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) - expected = Series([4, 3, 2], index=["b", "a", "d"]) + expected = Series([4, 3, 2], index=["b", "a", "d"], name="count") tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -202,7 +212,7 @@ def 
test_value_counts_bins(index_or_series): assert s.nunique() == 3 s = klass({}) if klass is dict else klass({}, dtype=object) - expected = Series([], dtype=np.int64) + expected = Series([], dtype=np.int64, name="count") tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): @@ -240,7 +250,7 @@ def test_value_counts_datetime64(index_or_series): idx = pd.to_datetime( ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] ) - expected_s = Series([3, 2, 1], index=idx) + expected_s = Series([3, 2, 1], index=idx, name="count") tm.assert_series_equal(s.value_counts(), expected_s) expected = pd.array( @@ -265,7 +275,9 @@ def test_value_counts_datetime64(index_or_series): tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) - expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s]) + expected_s = pd.concat( + [Series([4], index=DatetimeIndex([pd.NaT]), name="count"), expected_s] + ) tm.assert_series_equal(result, expected_s) assert s.dtype == "datetime64[ns]" @@ -288,7 +300,7 @@ def test_value_counts_datetime64(index_or_series): td = klass(td, name="dt") result = td.value_counts() - expected_s = Series([6], index=[Timedelta("1day")], name="dt") + expected_s = Series([6], index=Index([Timedelta("1day")], name="dt"), name="count") tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(["1 days"], name="dt") @@ -311,7 +323,7 @@ def test_value_counts_with_nan(dropna, index_or_series): obj = klass(values) res = obj.value_counts(dropna=dropna) if dropna is True: - expected = Series([1], index=Index([True], dtype=obj.dtype)) + expected = Series([1], index=Index([True], dtype=obj.dtype), name="count") else: - expected = Series([1, 1, 1], index=[True, pd.NA, np.nan]) + expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count") tm.assert_series_equal(res, expected) From 
d872960bf09649901fd66380f670f59b4e1e59d2 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 10:14:32 +0000 Subject: [PATCH 04/23] more fixups --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/frame.py | 3 ++- pandas/tests/arrays/boolean/test_function.py | 2 +- pandas/tests/arrays/floating/test_function.py | 4 ++-- pandas/tests/arrays/integer/test_function.py | 4 ++-- pandas/tests/arrays/string_/test_string.py | 6 +++--- pandas/tests/arrays/test_datetimes.py | 2 +- pandas/tests/extension/base/methods.py | 6 ++++-- pandas/tests/frame/methods/test_value_counts.py | 11 +++++++++-- .../tests/indexes/datetimelike_/test_value_counts.py | 6 +++--- 11 files changed, 29 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 92611752415fe..f579efb6352d8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -795,7 +795,7 @@ def value_counts(self, dropna: bool = True) -> Series: index = Index(type(self)(values)) - return Series(counts, index=index).astype("Int64") + return Series(counts, index=index, name="count").astype("Int64") @classmethod def _concat_same_type( diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a9af210e08741..37a1bca7a0d66 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1508,7 +1508,7 @@ def value_counts(self, dropna: bool = True) -> Series: ix = coerce_indexer_dtype(ix, self.dtype.categories) ix = self._from_backing_data(ix) - return Series(count, index=CategoricalIndex(ix), dtype="int64") + return Series(count, index=CategoricalIndex(ix), dtype="int64", name="count") # error: Argument 2 of "_empty" is incompatible with supertype # "NDArrayBackedExtensionArray"; supertype defines the argument type as diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f3b55533919dd..a4550a531e918 100644 --- 
a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7107,7 +7107,8 @@ def value_counts( if subset is None: subset = self.columns.tolist() - counts = self.groupby(subset, dropna=dropna).grouper.size() + name = "proportion" if normalize else "count" + counts = self.groupby(subset, dropna=dropna).grouper.size().rename(name) if sort: counts = counts.sort_values(ascending=ascending) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 8e9112b531fad..16c811927dbab 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -105,7 +105,7 @@ def test_value_counts_na(): def test_value_counts_with_normalize(): ser = pd.Series([True, False, pd.NA], dtype="boolean") result = ser.value_counts(normalize=True) - expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2 + expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64", name="proportion") / 2 assert expected.index.dtype == "boolean" tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index fbdf419811e24..ee135f7003c64 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -113,14 +113,14 @@ def test_value_counts_empty(): result = ser.value_counts() idx = pd.Index([], dtype="Float64") assert idx.dtype == "Float64" - expected = pd.Series([], index=idx, dtype="Int64") + expected = pd.Series([], index=idx, dtype="Int64", name="count") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(): ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64") result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 assert expected.index.dtype == ser.dtype tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 73c8d4e6b1aed..ed9628a1e98b1 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -126,7 +126,7 @@ def test_value_counts_empty(): result = ser.value_counts() idx = pd.Index([], dtype=ser.dtype) assert idx.dtype == ser.dtype - expected = pd.Series([], index=idx, dtype="Int64") + expected = pd.Series([], index=idx, dtype="Int64", name="count") tm.assert_series_equal(result, expected) @@ -134,7 +134,7 @@ def test_value_counts_with_normalize(): # GH 33172 ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64") result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 assert expected.index.dtype == ser.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8d8d9ce20cefd..be7393e05c068 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -481,18 +481,18 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64") + expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64", name="count") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=arr[:2], dtype="Int64") + expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(dtype): ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) result = ser.value_counts(normalize=True) - expected = pd.Series([2, 1], index=ser[:2], 
dtype="Float64") / 3 + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8c4701bb2f8ee..56ab13c264344 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -487,7 +487,7 @@ def test_value_counts_preserves_tz(self): arr[-2] = pd.NaT result = arr.value_counts(dropna=False) - expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT]) + expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT], name="count") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["pad", "backfill"]) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 2df410dff2b00..1b078d5395f77 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -46,9 +46,11 @@ def test_value_counts_with_normalize(self, data): result = ser.value_counts(normalize=True).sort_index() if not isinstance(data, pd.Categorical): - expected = pd.Series([1 / len(values)] * len(values), index=result.index) + expected = pd.Series( + [1 / len(values)] * len(values), index=result.index, name="proportion" + ) else: - expected = pd.Series(0.0, index=result.index) + expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) if na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 6e8528845ea6b..a785b532ee8c2 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -16,6 +16,7 @@ def test_data_frame_value_counts_unsorted(): index=pd.MultiIndex.from_arrays( [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] ), + name="count", ) 
tm.assert_series_equal(result, expected) @@ -33,6 +34,7 @@ def test_data_frame_value_counts_ascending(): index=pd.MultiIndex.from_arrays( [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] ), + name="count", ) tm.assert_series_equal(result, expected) @@ -50,6 +52,7 @@ def test_data_frame_value_counts_default(): index=pd.MultiIndex.from_arrays( [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] ), + name="count", ) tm.assert_series_equal(result, expected) @@ -67,6 +70,7 @@ def test_data_frame_value_counts_normalize(): index=pd.MultiIndex.from_arrays( [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] ), + name="proportion", ) tm.assert_series_equal(result, expected) @@ -79,6 +83,7 @@ def test_data_frame_value_counts_single_col_default(): expected = pd.Series( data=[2, 1, 1], index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]), + name="count", ) tm.assert_series_equal(result, expected) @@ -88,7 +93,7 @@ def test_data_frame_value_counts_empty(): df_no_cols = pd.DataFrame() result = df_no_cols.value_counts() - expected = pd.Series([], dtype=np.int64) + expected = pd.Series([], dtype=np.int64, name="count") tm.assert_series_equal(result, expected) @@ -97,7 +102,7 @@ def test_data_frame_value_counts_empty_normalize(): df_no_cols = pd.DataFrame() result = df_no_cols.value_counts(normalize=True) - expected = pd.Series([], dtype=np.float64) + expected = pd.Series([], dtype=np.float64, name="proportion") tm.assert_series_equal(result, expected) @@ -116,6 +121,7 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): index=pd.MultiIndex.from_arrays( [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] ), + name="count", ) tm.assert_series_equal(result, expected) @@ -141,6 +147,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): codes=[[0, 1, 2, 2], [2, 0, 1, 2]], names=["first_name", "middle_name"], ), + name="count", ) tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/indexes/datetimelike_/test_value_counts.py b/pandas/tests/indexes/datetimelike_/test_value_counts.py index f0df6dd678ef5..a0f05a1a35d79 100644 --- a/pandas/tests/indexes/datetimelike_/test_value_counts.py +++ b/pandas/tests/indexes/datetimelike_/test_value_counts.py @@ -38,7 +38,7 @@ def _check_value_counts_with_repeats(self, orig): exp_idx = orig[::-1] if not isinstance(exp_idx, PeriodIndex): exp_idx = exp_idx._with_freq(None) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64", name="count") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) @@ -89,13 +89,13 @@ def test_value_counts_unique_periodindex2(self): def _check_value_counts_dropna(self, idx): exp_idx = idx[[2, 3]] - expected = Series([3, 2], index=exp_idx) + expected = Series([3, 2], index=exp_idx, name="count") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) exp_idx = idx[[2, 3, -1]] - expected = Series([3, 2, 1], index=exp_idx) + expected = Series([3, 2, 1], index=exp_idx, name="count") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(dropna=False), expected) From f7cf72a97721c5ce446f6ff0cd0d71656800b947 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 10:51:49 +0000 Subject: [PATCH 05/23] getting there --- pandas/core/algorithms.py | 3 +-- pandas/core/arrays/masked.py | 4 +-- pandas/core/groupby/generic.py | 26 ++++++++++--------- pandas/tests/arrays/boolean/test_function.py | 4 +-- pandas/tests/arrays/floating/test_function.py | 4 +-- pandas/tests/arrays/integer/test_function.py | 4 +-- pandas/tests/groupby/test_value_counts.py | 4 +-- 7 files changed, 25 insertions(+), 24 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 20efbdddb283c..3f44277d9f2f0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -871,8 +871,7 @@ def value_counts( 
if is_extension_array_dtype(values): # handle Categorical and sparse, - result = Series(values)._values.value_counts(dropna=dropna) - result.name = name + result = Series(values)._values.value_counts(dropna=dropna).rename(name) result.index.name = index_name counts = result._values diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 434d6198210f7..25776f92ee44c 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -974,7 +974,7 @@ def value_counts(self, dropna: bool = True) -> Series: ) if dropna: - res = Series(value_counts, index=keys) + res = Series(value_counts, index=keys, name="count") res.index = res.index.astype(self.dtype) res = res.astype("Int64") return res @@ -990,7 +990,7 @@ def value_counts(self, dropna: bool = True) -> Series: mask = np.zeros(len(counts), dtype="bool") counts_array = IntegerArray(counts, mask) - return Series(counts_array, index=index) + return Series(counts_array, index=index, name="count") @doc(ExtensionArray.equals) def equals(self, other) -> bool: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a9ef1fca7e8e7..aa03ee074e60d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -611,7 +611,8 @@ def value_counts( ids, _, _ = self.grouper.group_info val = self.obj._values - names = self.grouper.names + [self.obj.name] + index_names = self.grouper.names + [self.obj.name] + name = "proportion" if normalize else "count" if is_categorical_dtype(val.dtype) or ( bins is not None and not np.iterable(bins) @@ -625,8 +626,8 @@ def value_counts( sort=sort, ascending=ascending, bins=bins, - ) - ser.index.names = names + ).rename(name) + ser.index.names = index_names return ser # groupby removes null keys from groupings @@ -736,11 +737,13 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] codes.append(left[-1]) - mi = MultiIndex(levels=levels, codes=codes, 
names=names, verify_integrity=False) + mi = MultiIndex( + levels=levels, codes=codes, names=index_names, verify_integrity=False + ) if is_integer_dtype(out.dtype): out = ensure_int64(out) - return self.obj._constructor(out, index=mi, name=self.obj.name) + return self.obj._constructor(out, index=mi, name=name) def fillna( self, @@ -1960,6 +1963,7 @@ def value_counts( raise NotImplementedError( "DataFrameGroupBy.value_counts only handles axis=0" ) + name = "proportion" if normalize else "count" with self._group_selection_context(): df = self.obj @@ -1968,8 +1972,8 @@ def value_counts( grouping.name for grouping in self.grouper.groupings if grouping.in_axis } if isinstance(self._selected_obj, Series): - name = self._selected_obj.name - keys = [] if name in in_axis_names else [self._selected_obj] + _name = self._selected_obj.name + keys = [] if _name in in_axis_names else [self._selected_obj] else: unique_cols = set(self._selected_obj.columns) if subset is not None: @@ -1992,8 +1996,8 @@ def value_counts( keys = [ # Can't use .values because the column label needs to be preserved self._selected_obj.iloc[:, idx] - for idx, name in enumerate(self._selected_obj.columns) - if name not in in_axis_names and name in subsetted + for idx, _name in enumerate(self._selected_obj.columns) + if _name not in in_axis_names and _name in subsetted ] groupings = list(self.grouper.groupings) @@ -2015,7 +2019,7 @@ def value_counts( observed=self.observed, dropna=self.dropna, ) - result_series = cast(Series, gb.size()) + result_series = cast(Series, gb.size().rename(name)) # GH-46357 Include non-observed categories # of non-grouping columns regardless of `observed` @@ -2059,14 +2063,12 @@ def value_counts( result = result_series else: # Convert to frame - name = "proportion" if normalize else "count" index = result_series.index columns = com.fill_missing_names(index.names) if name in columns: raise ValueError( f"Column label '{name}' is duplicate of result column" ) - result_series.name = 
name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() result_frame.columns = columns + [name] diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 16c811927dbab..d6a4eb19d1dd3 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -92,12 +92,12 @@ def test_ufunc_reduce_raises(values): def test_value_counts_na(): arr = pd.array([True, False, pd.NA], dtype="boolean") result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=arr, dtype="Int64") + expected = pd.Series([1, 1, 1], index=arr, dtype="Int64", name="count") assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64") + expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64", name="count") assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index ee135f7003c64..430b90db52325 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -100,11 +100,11 @@ def test_value_counts_na(): result = arr.value_counts(dropna=False) idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype) assert idx.dtype == arr.dtype - expected = pd.Series([2, 1, 1], index=idx, dtype="Int64") + expected = pd.Series([2, 1, 1], index=idx, dtype="Int64", name="count") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64") + expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64", name="count") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 
ed9628a1e98b1..34aed6398fa83 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -111,11 +111,11 @@ def test_value_counts_na(): result = arr.value_counts(dropna=False) ex_index = pd.Index([1, 2, pd.NA], dtype="Int64") assert ex_index.dtype == "Int64" - expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64") + expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64", name="count") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=arr[:2], dtype="Int64") + expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count") assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 577a72d3f5090..ad3ffade0dced 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -37,7 +37,7 @@ def tests_value_counts_index_names_category_column(): df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"]) df_mi_expected["gender"] = df_mi_expected["gender"].astype("category") mi_expected = MultiIndex.from_frame(df_mi_expected) - expected = Series([1], index=mi_expected, name="gender") + expected = Series([1], index=mi_expected, name="count") tm.assert_series_equal(result, expected) @@ -149,7 +149,7 @@ def test_series_groupby_value_counts_empty(columns): dfg = df.groupby(columns[:-1]) result = dfg[columns[-1]].value_counts() - expected = Series([], name=columns[-1], dtype=result.dtype) + expected = Series([], dtype=result.dtype, name="count") expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns) tm.assert_series_equal(result, expected) From 6e281e1f128ebd5e8cab899b1e2d64cc8ae2e5fd Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 11:49:47 +0000 Subject: [PATCH 06/23] wip, do this apply thing separately --- 
pandas/core/algorithms.py | 2 +- pandas/core/base.py | 3 ++- pandas/core/groupby/generic.py | 4 ++-- pandas/core/groupby/groupby.py | 8 ++++---- pandas/tests/groupby/test_value_counts.py | 3 ++- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3f44277d9f2f0..8fa62d7160ab3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -855,7 +855,7 @@ def value_counts( raise TypeError("bins argument only works with numeric data.") from err # count, remove nulls (from the index), and but the bins - result = ii.value_counts(dropna=dropna) + result = ii.value_counts(dropna=dropna).rename(name) result = result[result.index.notna()] result.index = result.index.astype("interval") result = result.sort_index() diff --git a/pandas/core/base.py b/pandas/core/base.py index fb563fd640cfd..c18aed5c8a381 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -986,7 +986,7 @@ def value_counts( NaN 1 dtype: int64 """ - return algorithms.value_counts( + ret = algorithms.value_counts( self, sort=sort, ascending=ascending, @@ -994,6 +994,7 @@ def value_counts( bins=bins, dropna=dropna, ) + return ret def unique(self): values = self._values diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index aa03ee074e60d..80ed1ebea38da 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -384,7 +384,6 @@ def _wrap_applied_output( dtype=data.dtype, ) assert values is not None - if isinstance(values[0], dict): # GH #823 #24880 index = self.grouper.result_index @@ -401,7 +400,8 @@ def _wrap_applied_output( not_indexed_same=not_indexed_same, is_transform=is_transform, ) - result.name = self.obj.name + if result.name is None: + result.name = self.obj.name return result else: # GH #6265 #24880 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 733cf7f262f46..ac7b98324eb40 100644 --- a/pandas/core/groupby/groupby.py +++ 
b/pandas/core/groupby/groupby.py @@ -1176,10 +1176,10 @@ def reset_identity(values): values = reset_identity(values) result = concat(values, axis=self.axis) - name = self.obj.name if self.obj.ndim == 1 else self._selection - if isinstance(result, Series) and name is not None: - - result.name = name + if result.name is None: + name = self.obj.name if self.obj.ndim == 1 else self._selection + if isinstance(result, Series) and name is not None: + result.name = name return result diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index ad3ffade0dced..d35c56ec57623 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -162,7 +162,7 @@ def test_series_groupby_value_counts_one_row(columns): dfg = df.groupby(columns[:-1]) result = dfg[columns[-1]].value_counts() - expected = df.value_counts().rename(columns[-1]) + expected = df.value_counts() tm.assert_series_equal(result, expected) @@ -183,6 +183,7 @@ def test_series_groupby_value_counts_on_categorical(): ), ] ), + name="count", ) # Expected: From 0fc0becc6e2f0dcbf9f5eaacb40a438e866dfb6c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 14:17:06 +0000 Subject: [PATCH 07/23] fixup --- pandas/core/groupby/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 80ed1ebea38da..aa03ee074e60d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -384,6 +384,7 @@ def _wrap_applied_output( dtype=data.dtype, ) assert values is not None + if isinstance(values[0], dict): # GH #823 #24880 index = self.grouper.result_index @@ -400,8 +401,7 @@ def _wrap_applied_output( not_indexed_same=not_indexed_same, is_transform=is_transform, ) - if result.name is None: - result.name = self.obj.name + result.name = self.obj.name return result else: # GH #6265 #24880 From 
df6a108671e45a3e3b8a3d696c775576a24b3f75 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 14:19:42 +0000 Subject: [PATCH 08/23] more fixup --- pandas/core/groupby/groupby.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ac7b98324eb40..34a9d01a048ba 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1176,10 +1176,9 @@ def reset_identity(values): values = reset_identity(values) result = concat(values, axis=self.axis) - if result.name is None: - name = self.obj.name if self.obj.ndim == 1 else self._selection - if isinstance(result, Series) and name is not None: - result.name = name + name = self.obj.name if self.obj.ndim == 1 else self._selection + if isinstance(result, Series) and name is not None: + result.name = name return result From 94187e4f2d879a6262498232364d7c1cca2889c3 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 14:31:28 +0000 Subject: [PATCH 09/23] tmp fixup per gh49909 --- pandas/core/base.py | 3 +-- pandas/core/groupby/groupby.py | 1 + pandas/tests/groupby/test_value_counts.py | 14 ++++++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index c18aed5c8a381..fb563fd640cfd 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -986,7 +986,7 @@ def value_counts( NaN 1 dtype: int64 """ - ret = algorithms.value_counts( + return algorithms.value_counts( self, sort=sort, ascending=ascending, @@ -994,7 +994,6 @@ def value_counts( bins=bins, dropna=dropna, ) - return ret def unique(self): values = self._values diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 34a9d01a048ba..733cf7f262f46 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1178,6 +1178,7 @@ def reset_identity(values): name = self.obj.name if self.obj.ndim == 1 else self._selection if 
isinstance(result, Series) and name is not None: + result.name = name return result diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index d35c56ec57623..0aa57ab9d8bc6 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -82,12 +82,18 @@ def seed_df(seed_nans, n, m): @pytest.mark.slow @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) @pytest.mark.parametrize("isort", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "normalize, name", + [ + (True, "proportion"), + (False, "count"), + ], +) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) def test_series_groupby_value_counts( - df, keys, bins, n, m, isort, normalize, sort, ascending, dropna + df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna ): def rebuild_index(df): arr = list(map(df.index.get_level_values, range(df.index.nlevels))) @@ -108,6 +114,8 @@ def rebuild_index(df): gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) right.index.names = right.index.names[:-1] + ["3rd"] + # https://github.com/pandas-dev/pandas/issues/49909 + right = right.rename(name) # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 @@ -138,6 +146,8 @@ def test_series_groupby_value_counts_with_grouper(): result = dfg["Food"].value_counts().sort_index() expected = dfg["Food"].apply(Series.value_counts).sort_index() expected.index.names = result.index.names + # https://github.com/pandas-dev/pandas/issues/49909 + expected = expected.rename("count") tm.assert_series_equal(result, expected) From f4734a264866e703d9730772a90060a501cf7f0d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 14:34:09 +0000 Subject: [PATCH 10/23] :art: --- 
pandas/core/algorithms.py | 1 + pandas/tests/groupby/test_value_counts.py | 8 +------- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8fa62d7160ab3..53364f8dcc9c6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -870,6 +870,7 @@ def value_counts( else: if is_extension_array_dtype(values): + # handle Categorical and sparse, result = Series(values)._values.value_counts(dropna=dropna).rename(name) result.index.name = index_name diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 0aa57ab9d8bc6..21dac7138276a 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -82,13 +82,7 @@ def seed_df(seed_nans, n, m): @pytest.mark.slow @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) @pytest.mark.parametrize("isort", [True, False]) -@pytest.mark.parametrize( - "normalize, name", - [ - (True, "proportion"), - (False, "count"), - ], -) +@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) From eb8a070ca336c70e6629244bcc9b771818d37d97 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 14:56:07 +0000 Subject: [PATCH 11/23] fixup test --- pandas/tests/io/pytables/test_round_trip.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index ce71e9e990364..c46db4e86fb4c 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -252,7 +252,8 @@ def test_table_values_dtypes_roundtrip(setup_path): "int64": 1, "object": 1, "datetime64[ns]": 2, - } + }, + name="count", ) result = result.sort_index() expected = 
expected.sort_index() From 354dd958f12f67b4219f040c25f7cf8dd5b10555 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 15:51:23 +0000 Subject: [PATCH 12/23] fixup tests --- pandas/core/base.py | 8 +- pandas/core/frame.py | 12 +- pandas/core/groupby/generic.py | 6 +- pandas/tests/base/test_value_counts.py | 4 +- pandas/tests/frame/methods/test_asfreq.py | 4 +- pandas/tests/frame/test_api.py | 2 +- pandas/tests/frame/test_arithmetic.py | 8 +- pandas/tests/groupby/test_categorical.py | 2 +- .../tests/groupby/test_frame_value_counts.py | 106 ++++++++++++------ pandas/tests/reshape/test_get_dummies.py | 4 +- pandas/tests/test_algos.py | 58 ++++++---- 11 files changed, 140 insertions(+), 74 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index fb563fd640cfd..04708d31bc4f9 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -948,7 +948,7 @@ def value_counts( 1.0 1 2.0 1 4.0 1 - dtype: int64 + Name: count, dtype: int64 With `normalize` set to `True`, returns the relative frequency by dividing all values by the sum of values. 
@@ -959,7 +959,7 @@ def value_counts( 1.0 0.2 2.0 0.2 4.0 0.2 - dtype: float64 + Name: proportion, dtype: float64 **bins** @@ -972,7 +972,7 @@ def value_counts( (0.996, 2.0] 2 (2.0, 3.0] 2 (3.0, 4.0] 1 - dtype: int64 + Name: count, dtype: int64 **dropna** @@ -984,7 +984,7 @@ def value_counts( 2.0 1 4.0 1 NaN 1 - dtype: int64 + Name: count, dtype: int64 """ return algorithms.value_counts( self, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a4550a531e918..02d3d1c35c933 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7056,28 +7056,28 @@ def value_counts( 4 0 2 2 2 1 6 0 1 - dtype: int64 + Name: count, dtype: int64 >>> df.value_counts(sort=False) num_legs num_wings 2 2 1 4 0 2 6 0 1 - dtype: int64 + Name: count, dtype: int64 >>> df.value_counts(ascending=True) num_legs num_wings 2 2 1 6 0 1 4 0 2 - dtype: int64 + Name: count, dtype: int64 >>> df.value_counts(normalize=True) num_legs num_wings 4 0 0.50 2 2 0.25 6 0 0.25 - dtype: float64 + Name: proportion, dtype: float64 With `dropna` set to `False` we can also count rows with NA values. 
@@ -7094,7 +7094,7 @@ def value_counts( first_name middle_name Beth Louise 1 John Smith 1 - dtype: int64 + Name: count, dtype: int64 >>> df.value_counts(dropna=False) first_name middle_name @@ -7102,7 +7102,7 @@ def value_counts( Beth Louise 1 John Smith 1 NaN 1 - dtype: int64 + Name: count, dtype: int64 """ if subset is None: subset = self.columns.tolist() diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index aa03ee074e60d..02b8d350ce1a5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1923,7 +1923,7 @@ def value_counts( male low FR 2 US 1 medium FR 1 - dtype: int64 + Name: count, dtype: int64 >>> df.groupby('gender').value_counts(ascending=True) gender education country @@ -1932,7 +1932,7 @@ def value_counts( male low US 1 medium FR 1 low FR 2 - dtype: int64 + Name: count, dtype: int64 >>> df.groupby('gender').value_counts(normalize=True) gender education country @@ -1941,7 +1941,7 @@ def value_counts( male low FR 0.50 US 0.25 medium FR 0.25 - dtype: float64 + Name: proportion, dtype: float64 >>> df.groupby('gender', as_index=False).value_counts() gender education country count diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 26a08f450265d..8aee3b4c2b01b 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -169,7 +169,7 @@ def test_value_counts_bins(index_or_series): exp1 = Series({Interval(0.997, 3.0): 4}, name="count") tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}, name="count") + exp1n = Series({Interval(0.997, 3.0): 1.0}, name="proportion") tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): @@ -193,7 +193,7 @@ def test_value_counts_bins(index_or_series): res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series( - [0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="count" + [0.5, 0.25, 0.25, 
0], index=intervals.take([0, 1, 3, 2]), name="proportion" ) tm.assert_series_equal(res4n, exp4n) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 07eacb5e89e3a..01c420c1e4274 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -49,7 +49,9 @@ def test_asfreq2(self, frame_or_series): if frame_or_series is Series: daily_ts = ts.asfreq("D", fill_value=-1) result = daily_ts.value_counts().sort_index() - expected = Series([60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0]).sort_index() + expected = Series( + [60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count" + ).sort_index() tm.assert_series_equal(result, expected) def test_asfreq_datetimeindex_empty(self, frame_or_series): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 1ab20c282b23a..78cadae3f206d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -219,7 +219,7 @@ def test_with_datetimelikes(self): t = df.T result = t.dtypes.value_counts() - expected = Series({np.dtype("object"): 10}) + expected = Series({np.dtype("object"): 10}, name="count") tm.assert_series_equal(result, expected) def test_deepcopy(self, float_frame): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 545482e6d3dad..bf8638a9533fd 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -444,7 +444,9 @@ def test_df_flex_cmp_constant_return_types(self, opname): const = 2 result = getattr(df, opname)(const).dtypes.value_counts() - tm.assert_series_equal(result, Series([2], index=[np.dtype(bool)])) + tm.assert_series_equal( + result, Series([2], index=[np.dtype(bool)], name="count") + ) @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types_empty(self, opname): @@ -454,7 +456,9 @@ def test_df_flex_cmp_constant_return_types_empty(self, 
opname): empty = df.iloc[:0] result = getattr(empty, opname)(const).dtypes.value_counts() - tm.assert_series_equal(result, Series([2], index=[np.dtype(bool)])) + tm.assert_series_equal( + result, Series([2], index=[np.dtype(bool)], name="count") + ) def test_df_flex_cmp_ea_dtype_with_ndarray_series(self): ii = pd.IntervalIndex.from_breaks([1, 2, 3]) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5c250618bf3c4..b77bd77f49d47 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -275,7 +275,7 @@ def test_sorting_with_different_categoricals(): index = Categorical(index, categories=["low", "med", "high"], ordered=True) index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)] index = MultiIndex.from_arrays(index, names=["group", "dose"]) - expected = Series([2] * 6, index=index, name="dose") + expected = Series([2] * 6, index=index, name="count") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 8255fbab40dce..1459fa5af7d61 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -51,6 +51,7 @@ def test_basic(education_df): ], names=["country", "gender", "education"], ), + name="proportion", ) tm.assert_series_equal(result, expected) @@ -60,7 +61,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("groupby", ["column", "array", "function"]) -@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize( "sort, ascending", [ @@ -72,7 +73,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - 
education_df, groupby, normalize, sort, ascending, as_index, frame + education_df, groupby, normalize, name, sort, ascending, as_index, frame ): # test all parameters: # - Use column, array or function as by= parameter @@ -117,7 +118,7 @@ def test_against_frame_and_seriesgroupby( expected = gp["both"].value_counts( normalize=normalize, sort=sort, ascending=ascending ) - expected.name = None + expected.name = name if as_index: index_frame = expected.index.to_frame(index=False) index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) @@ -176,16 +177,23 @@ def animals_df(): @pytest.mark.parametrize( - "sort, ascending, normalize, expected_data, expected_index", + "sort, ascending, normalize, name, expected_data, expected_index", [ - (False, None, False, [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), - (True, True, False, [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), - (True, False, False, [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), - (True, False, True, [0.5, 0.25, 0.25], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), + (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), + (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + ( + True, + False, + True, + "proportion", + [0.5, 0.25, 0.25], + [(1, 1, 1), (4, 2, 6), (0, 2, 0)], + ), ], ) def test_data_frame_value_counts( - animals_df, sort, ascending, normalize, expected_data, expected_index + animals_df, sort, ascending, normalize, name, expected_data, expected_index ): # 3-way compare with :meth:`~DataFrame.value_counts` # Tests from frame/methods/test_value_counts.py @@ -197,6 +205,7 @@ def test_data_frame_value_counts( index=MultiIndex.from_arrays( expected_index, names=["key", "num_legs", "num_wings"] ), + name=name, ) tm.assert_series_equal(result_frame, expected) @@ -243,7 +252,7 @@ def test_dropna_combinations( for column in nulls_df.columns: columns[column] = 
[nulls_df[column][row] for row in expected_rows] index = MultiIndex.from_frame(columns) - expected = Series(data=expected_values, index=index) + expected = Series(data=expected_values, index=index, name="proportion") tm.assert_series_equal(result, expected) @@ -284,9 +293,9 @@ def names_with_nulls_df(nulls_fixture): ), ], ) -@pytest.mark.parametrize("normalize", [False, True]) +@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")]) def test_data_frame_value_counts_dropna( - names_with_nulls_df, dropna, normalize, expected_data, expected_index + names_with_nulls_df, dropna, normalize, name, expected_data, expected_index ): # GH 41334 # 3-way compare with :meth:`~DataFrame.value_counts` @@ -295,6 +304,7 @@ def test_data_frame_value_counts_dropna( expected = Series( data=expected_data, index=expected_index, + name=name, ) if normalize: expected /= float(len(expected_data)) @@ -311,17 +321,22 @@ def test_data_frame_value_counts_dropna( @pytest.mark.parametrize("as_index", [False, True]) @pytest.mark.parametrize("observed", [False, True]) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ - (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), ( True, + "proportion", np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), ), ], ) def test_categorical_single_grouper_with_only_observed_categories( - education_df, as_index, observed, normalize, expected_data + education_df, as_index, observed, normalize, name, expected_data ): # Test single categorical grouper with only observed grouping categories @@ -353,6 +368,7 @@ def test_categorical_single_grouper_with_only_observed_categories( expected_series = Series( data=expected_data, index=expected_index, + name=name, ) for i in range(3): expected_series.index = expected_series.index.set_levels( @@ -369,7 +385,7 @@ def 
test_categorical_single_grouper_with_only_observed_categories( def assert_categorical_single_grouper( - education_df, as_index, observed, expected_index, normalize, expected_data + education_df, as_index, observed, expected_index, normalize, name, expected_data ): # Test single categorical grouper when non-groupers are also categorical education_df = education_df.copy().astype("category") @@ -386,6 +402,7 @@ def assert_categorical_single_grouper( expected_index, names=["country", "gender", "education"], ), + name=name, ) for i in range(3): index_level = CategoricalIndex(expected_series.index.levels[i]) @@ -398,25 +415,28 @@ def assert_categorical_single_grouper( if as_index: tm.assert_series_equal(result, expected_series) else: - expected = expected_series.reset_index( - name="proportion" if normalize else "count" - ) + expected = expected_series.reset_index(name=name) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ - (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), ( True, + "proportion", np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), ), ], ) def test_categorical_single_grouper_observed_true( - education_df, as_index, normalize, expected_data + education_df, as_index, normalize, name, expected_data ): # GH#46357 @@ -441,22 +461,25 @@ def test_categorical_single_grouper_observed_true( observed=True, expected_index=expected_index, normalize=normalize, + name=name, expected_data=expected_data, ) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ ( False, + "count", np.array( [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64 ), ), ( True, + "proportion", np.array( [ 0.5, @@ 
-483,7 +506,7 @@ def test_categorical_single_grouper_observed_true( ], ) def test_categorical_single_grouper_observed_false( - education_df, as_index, normalize, expected_data + education_df, as_index, normalize, name, expected_data ): # GH#46357 @@ -514,6 +537,7 @@ def test_categorical_single_grouper_observed_false( observed=False, expected_index=expected_index, normalize=normalize, + name=name, expected_data=expected_data, ) @@ -552,18 +576,23 @@ def test_categorical_single_grouper_observed_false( ], ) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ - (False, np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64)), + ( + False, + "count", + np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64), + ), ( True, + "proportion", # NaN values corresponds to non-observed groups np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]), ), ], ) def test_categorical_multiple_groupers( - education_df, as_index, observed, expected_index, normalize, expected_data + education_df, as_index, observed, expected_index, normalize, name, expected_data ): # GH#46357 @@ -583,6 +612,7 @@ def test_categorical_multiple_groupers( expected_index, names=["country", "education", "gender"], ), + name=name, ) for i in range(2): expected_series.index = expected_series.index.set_levels( @@ -601,18 +631,23 @@ def test_categorical_multiple_groupers( @pytest.mark.parametrize("as_index", [False, True]) @pytest.mark.parametrize("observed", [False, True]) @pytest.mark.parametrize( - "normalize, expected_data", + "normalize, name, expected_data", [ - (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), ( True, + "proportion", # NaN values corresponds to non-observed groups np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), ), ], ) def test_categorical_non_groupers( - education_df, as_index, 
observed, normalize, expected_data + education_df, as_index, observed, normalize, name, expected_data ): # GH#46357 Test non-observed categories are included in the result, # regardless of `observed` @@ -643,6 +678,7 @@ def test_categorical_non_groupers( expected_index, names=["country", "gender", "education"], ), + name=name, ) for i in range(1, 3): expected_series.index = expected_series.index.set_levels( @@ -703,6 +739,7 @@ def test_column_label_duplicates(test, columns, expected_names, as_index): expected_data, names=expected_names, ), + name="count", ) tm.assert_series_equal(result, expected) else: @@ -736,7 +773,9 @@ def test_ambiguous_grouping(): df = DataFrame({"a": [1, 1]}) gb = df.groupby([1, 1]) result = gb.value_counts() - expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"])) + expected = Series( + [2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count" + ) tm.assert_series_equal(result, expected) @@ -761,7 +800,9 @@ def test_subset(): df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) result = df.groupby(level=0).value_counts(subset=["c2"]) expected = Series( - [1, 2], index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]) + [1, 2], + index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]), + name="count", ) tm.assert_series_equal(result, expected) @@ -779,5 +820,6 @@ def test_subset_duplicate_columns(): index=MultiIndex.from_arrays( [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"] ), + name="count", ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 4345a357a0ba8..9366aa767143f 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -100,7 +100,7 @@ def test_get_dummies_basic_types(self, sparse, dtype): else: dtype_name = self.effective_dtype(dtype).name - expected = Series({dtype_name: 8}) + 
expected = Series({dtype_name: 8}, name="count") result = result.dtypes.value_counts() result.index = [str(i) for i in result.index] tm.assert_series_equal(result, expected) @@ -110,7 +110,7 @@ def test_get_dummies_basic_types(self, sparse, dtype): expected_counts = {"int64": 1, "object": 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) - expected = Series(expected_counts).sort_index() + expected = Series(expected_counts, name="count").sort_index() result = result.dtypes.value_counts() result.index = [str(i) for i in result.index] result = result.sort_index() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b982d247c2707..21c308c629034 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1145,18 +1145,22 @@ def test_value_counts(self): result = algos.value_counts(factor) breaks = [-1.194, -0.535, 0.121, 0.777, 1.433] index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) - expected = Series([1, 1, 1, 1], index=index) + expected = Series([1, 1, 1, 1], index=index, name="count") tm.assert_series_equal(result.sort_index(), expected.sort_index()) def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)])) + expected = Series( + [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), name="count" + ) tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) expected = Series( - [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]) + [2, 2], + index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]), + name="count", ) tm.assert_series_equal(result, expected) @@ -1184,7 +1188,7 @@ def test_value_counts_nat(self): assert len(vc) == 1 assert len(vc_with_na) == 2 - exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}) + exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count") tm.assert_series_equal(algos.value_counts(dt), 
exp_dt) # TODO same for (timedelta) @@ -1206,7 +1210,7 @@ def test_value_counts_datetime_outofbounds(self): [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], dtype=object, ) - exp = Series([3, 2, 1], index=exp_index) + exp = Series([3, 2, 1], index=exp_index, name="count") tm.assert_series_equal(res, exp) # GH 12424 @@ -1217,7 +1221,9 @@ def test_value_counts_datetime_outofbounds(self): def test_categorical(self): s = Series(Categorical(list("aaabbc"))) result = s.value_counts() - expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"])) + expected = Series( + [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), name="count" + ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1234,10 +1240,13 @@ def test_categorical_nans(self): expected = Series( [4, 3, 2], index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), + name="count", ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan])) + expected = Series( + [4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]), name="count" + ) tm.assert_series_equal(result, expected, check_index_type=True) # out of order @@ -1249,8 +1258,11 @@ def test_categorical_nans(self): expected = Series( [4, 3, 2], index=CategoricalIndex( - ["a", "b", "c"], categories=["b", "a", "c"], ordered=True + ["a", "b", "c"], + categories=["b", "a", "c"], + ordered=True, ), + name="count", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1260,6 +1272,7 @@ def test_categorical_nans(self): index=CategoricalIndex( ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True ), + name="count", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1272,6 +1285,7 @@ def test_categorical_zeroes(self): index=Categorical( ["b", "a", "c", "d"], categories=list("abcd"), ordered=True ), + name="count", ) 
tm.assert_series_equal(result, expected, check_index_type=True) @@ -1280,37 +1294,37 @@ def test_dropna(self): tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=True), - Series([2, 1], index=[True, False]), + Series([2, 1], index=[True, False], name="count"), ) tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=False), - Series([2, 1], index=[True, False]), + Series([2, 1], index=[True, False], name="count"), ) tm.assert_series_equal( Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True), - Series([3, 2], index=Index([True, False], dtype=object)), + Series([3, 2], index=Index([True, False], dtype=object), name="count"), ) tm.assert_series_equal( Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False), - Series([5, 3, 2], index=[True, False, np.nan]), + Series([5, 3, 2], index=[True, False, np.nan], name="count"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=True), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 10.3], name="count"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=False), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 10.3], name="count"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 10.3], name="count"), ) result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False) - expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan]) + expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan], name="count") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", (np.float64, object, "M8[ns]")) @@ -1320,23 +1334,27 @@ def test_value_counts_normalized(self, dtype): s_typed = s.astype(dtype) result = s_typed.value_counts(normalize=True, dropna=False) expected = Series( - [0.5, 0.3, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=dtype) + [0.5, 0.3, 0.2], + 
index=Series([np.nan, 2.0, 1.0], dtype=dtype), + name="proportion", ) tm.assert_series_equal(result, expected) result = s_typed.value_counts(normalize=True, dropna=True) - expected = Series([0.6, 0.4], index=Series([2.0, 1.0], dtype=dtype)) + expected = Series( + [0.6, 0.4], index=Series([2.0, 1.0], dtype=dtype), name="proportion" + ) tm.assert_series_equal(result, expected) def test_value_counts_uint64(self): arr = np.array([2**63], dtype=np.uint64) - expected = Series([1], index=[2**63]) + expected = Series([1], index=[2**63], name="count") result = algos.value_counts(arr) tm.assert_series_equal(result, expected) arr = np.array([-1, 2**63], dtype=object) - expected = Series([1, 1], index=[-1, 2**63]) + expected = Series([1, 1], index=[-1, 2**63], name="count") result = algos.value_counts(arr) tm.assert_series_equal(result, expected) From ee9fb43a95f0686489874b7f26a509b7e60f3071 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 16:52:58 +0000 Subject: [PATCH 13/23] :label: typing --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 02b8d350ce1a5..499a84c629957 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2019,7 +2019,7 @@ def value_counts( observed=self.observed, dropna=self.dropna, ) - result_series = cast(Series, gb.size().rename(name)) + result_series = cast(Series, gb.size()).rename(name) # GH-46357 Include non-observed categories # of non-grouping columns regardless of `observed` From 77b2bc7a067bab92339f8ea7652d5a07b605ce86 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 17:03:44 +0000 Subject: [PATCH 14/23] :pencil: whatsnew --- doc/source/whatsnew/v2.0.0.rst | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index bc922bace053a..348a195f21ea4 100644 --- 
a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -207,6 +207,46 @@ a supported dtype: pd.Series(["2016-01-01"], dtype="datetime64[D]") +.. _whatsnew_200.api_breaking.value_counts: + +Value counts behaviour change +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In past versions, when running :meth:`Series.value_counts`, the result would inherit +the original object's name, and the result index would be nameless. This would cause +confusion when resetting the index, and the column names would not correspond with the +column values: + +.. code-block:: ipython + + In [5]: df = pd.DataFrame({'animal': ['quetzal', 'quetzal', 'elk']}) + + In [6]: df['animal'].value_counts().reset_index() + Out[6]: + index animal + 0 quetzal 2 + 1 elk 1 + +Now, the result name will be ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed), +and the index will be named after the original object. + +*Previous behavior*: + +.. code-block:: ipython + + In [8]: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts() + Out[8]: + quetzal 2 + elk 1 + Name: animal, dtype: int64 + +*New behavior*: + +.. ipython:: python + + pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts() + +Likewise for other ``value_counts`` methods (for example, :meth:`DataFrame.value_counts`). + .. _whatsnew_200.api_breaking.astype_to_unsupported_datetimelike: Disallow astype conversion to non-supported datetime64/timedelta64 dtypes From 766a583c925f842072b2eafdfef4591316844cef Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 17:10:31 +0000 Subject: [PATCH 15/23] rewrite whatsnew --- doc/source/whatsnew/v2.0.0.rst | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 348a195f21ea4..5516d9f7b46e1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -233,17 +233,25 @@ and the index will be named after the original object. 
.. code-block:: ipython - In [8]: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts() + In [8]: ( + ...: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal') + ...: .value_counts() + ...: .reset_index() + ...: ) Out[8]: - quetzal 2 - elk 1 - Name: animal, dtype: int64 + index animal + 0 quetzal 2 + 1 elk 1 *New behavior*: .. ipython:: python - pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts() + ( + pd.Series(['quetzal', 'quetzal', 'elk'], name='animal') + .value_counts() + .reset_index() + ) Likewise for other ``value_counts`` methods (for example, :meth:`DataFrame.value_counts`). From 53608b3adb58c3892907517b43ac7b8f527f3913 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 17:11:52 +0000 Subject: [PATCH 16/23] add back missing line --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5516d9f7b46e1..f8be2efb5cf00 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -238,6 +238,7 @@ and the index will be named after the original object. ...: .value_counts() ...: .reset_index() ...: ) + ...: Out[8]: index animal 0 quetzal 2 From 19633a458e68d94a25b40563e3eabfe65ba3323c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 25 Nov 2022 17:13:05 +0000 Subject: [PATCH 17/23] shorten --- doc/source/whatsnew/v2.0.0.rst | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index f8be2efb5cf00..f9322a2338eab 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -214,18 +214,7 @@ Value counts behaviour change In past versions, when running :meth:`Series.value_counts`, the result would inherit the original object's name, and the result index would be nameless. 
This would cause confusion when resetting the index, and the column names would not correspond with the -column values: - -.. code-block:: ipython - - In [5]: df = pd.DataFrame({'animal': ['quetzal', 'quetzal', 'elk']}) - - In [6]: df['animal'].value_counts().reset_index() - Out[6]: - index animal - 0 quetzal 2 - 1 elk 1 - +column values. Now, the result name will be ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed), and the index will be named after the original object. From c2c0f383f228b24a1fdebcf7198319ee333784ba Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 9 Dec 2022 10:12:55 +0000 Subject: [PATCH 18/23] pin name, simplify whatsnew example, reference issue, retitle --- doc/source/whatsnew/v2.0.0.rst | 28 ++++++++++------------------ pandas/core/algorithms.py | 3 ++- pandas/core/groupby/generic.py | 3 ++- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b4a0fa2758090..03abdb3673a4a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -230,39 +230,31 @@ a supported dtype: .. _whatsnew_200.api_breaking.value_counts: -Value counts behaviour change -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Value counts sets the resulting name to ``count`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In past versions, when running :meth:`Series.value_counts`, the result would inherit the original object's name, and the result index would be nameless. This would cause confusion when resetting the index, and the column names would not correspond with the column values. Now, the result name will be ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed), -and the index will be named after the original object. +and the index will be named after the original object (:issue:`49497`). *Previous behavior*: .. 
code-block:: ipython - In [8]: ( - ...: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal') - ...: .value_counts() - ...: .reset_index() - ...: ) - ...: - Out[8]: - index animal - 0 quetzal 2 - 1 elk 1 + In [8]: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts() + + Out[8]: + quetzal 2 + elk 1 + Name: animal, dtype: int64 *New behavior*: .. ipython:: python - ( - pd.Series(['quetzal', 'quetzal', 'elk'], name='animal') - .value_counts() - .reset_index() - ) + pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts() Likewise for other ``value_counts`` methods (for example, :meth:`DataFrame.value_counts`). diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bdbc85d8a923f..b30b4d5bca045 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -860,7 +860,8 @@ def value_counts( raise TypeError("bins argument only works with numeric data.") from err # count, remove nulls (from the index), and but the bins - result = ii.value_counts(dropna=dropna).rename(name) + result = ii.value_counts(dropna=dropna) + result.name = name result = result[result.index.notna()] result.index = result.index.astype("interval") result = result.sort_index() diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index db75e892c3580..e7b771febd1e1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -626,7 +626,8 @@ def value_counts( sort=sort, ascending=ascending, bins=bins, - ).rename(name) + ) + ser.name = name ser.index.names = index_names return ser From 29418265fdc1cd0344a69cc4b8610c5b0ca6482f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 18 Dec 2022 09:14:18 +0000 Subject: [PATCH 19/23] avoid rename --- pandas/core/algorithms.py | 3 ++- pandas/core/frame.py | 3 ++- pandas/core/groupby/generic.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b30b4d5bca045..0ac06a47904d3
100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -878,7 +878,8 @@ def value_counts( if is_extension_array_dtype(values): # handle Categorical and sparse, - result = Series(values)._values.value_counts(dropna=dropna).rename(name) + result = Series(values)._values.value_counts(dropna=dropna) + result.name = name result.index.name = index_name counts = result._values diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ef3e8af98d8da..2d17271c47382 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6969,7 +6969,8 @@ def value_counts( subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna).grouper.size().rename(name) + counts = self.groupby(subset, dropna=dropna).grouper.size() + counts.name = name if sort: counts = counts.sort_values(ascending=ascending) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5da69ae5b94ef..fe62d659ab41e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2087,7 +2087,8 @@ def value_counts( observed=self.observed, dropna=self.dropna, ) - result_series = cast(Series, gb.size()).rename(name) + result_series = cast(Series, gb.size()) + result_series.name = name # GH-46357 Include non-observed categories # of non-grouping columns regardless of `observed` From 1ac51aba50badae8fb2cdf827def83e5244d4437 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 9 Jan 2023 17:55:15 +0000 Subject: [PATCH 20/23] fixup new test --- pandas/tests/groupby/test_frame_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 4117b0b0aa3e1..0f026177accce 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -859,5 +859,5 @@ def test_value_counts_time_grouper(utc): codes=[[0, 1, 1, 2, 2, 3], 
range(6), [0, 0, 1, 2, 2, 3]], names=["Datetime", "Timestamp", "Food"], ) - expected = Series(1, index=index) + expected = Series(1, index=index, name="count") tm.assert_series_equal(result, expected) From cacf0100e6100a071b931500b07f2f65ad495d5d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 13 Jan 2023 14:56:03 +0000 Subject: [PATCH 21/23] adjust new path --- pandas/core/groupby/groupby.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 431b23023b094..a2b8438b96bff 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2178,6 +2178,7 @@ def _value_counts( raise NotImplementedError( "DataFrameGroupBy.value_counts only handles axis=0" ) + name = "proportion" if normalize else "count" with self._group_selection_context(): df = self.obj @@ -2186,8 +2187,8 @@ def _value_counts( grouping.name for grouping in self.grouper.groupings if grouping.in_axis } if isinstance(self._selected_obj, Series): - name = self._selected_obj.name - keys = [] if name in in_axis_names else [self._selected_obj] + _name = self._selected_obj.name + keys = [] if _name in in_axis_names else [self._selected_obj] else: unique_cols = set(self._selected_obj.columns) if subset is not None: @@ -2210,8 +2211,8 @@ def _value_counts( keys = [ # Can't use .values because the column label needs to be preserved self._selected_obj.iloc[:, idx] - for idx, name in enumerate(self._selected_obj.columns) - if name not in in_axis_names and name in subsetted + for idx, _name in enumerate(self._selected_obj.columns) + if _name not in in_axis_names and _name in subsetted ] groupings = list(self.grouper.groupings) @@ -2234,6 +2235,7 @@ def _value_counts( dropna=self.dropna, ) result_series = cast(Series, gb.size()) + result_series.name = name # GH-46357 Include non-observed categories # of non-grouping columns regardless of `observed` @@ -2277,7 +2279,6 @@ def _value_counts( 
result = result_series else: # Convert to frame - name = "proportion" if normalize else "count" index = result_series.index columns = com.fill_missing_names(index.names) if name in columns: From 416ca2beb65c34df2155173eafe55530ba5da72b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 13 Jan 2023 15:11:15 +0000 Subject: [PATCH 22/23] fixup --- pandas/core/groupby/generic.py | 4 +++- pandas/tests/groupby/test_value_counts.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 74871ceea0da9..dff415b2f7b5a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -644,11 +644,14 @@ def value_counts( bins=None, dropna: bool = True, ) -> Series: + name = "proportion" if normalize else "count" + if bins is None: result = self._value_counts( normalize=normalize, sort=sort, ascending=ascending, dropna=dropna ) assert isinstance(result, Series) + result.name = name return result from pandas.core.reshape.merge import get_join_indexers @@ -658,7 +661,6 @@ def value_counts( val = self.obj._values index_names = self.grouper.names + [self.obj.name] - name = "proportion" if normalize else "count" if is_categorical_dtype(val.dtype) or ( bins is not None and not np.iterable(bins) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index e172423d5403f..5ece46a42457f 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -220,7 +220,7 @@ def test_series_groupby_value_counts_no_sort(): codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]], names=["country", "gender", "education"], ) - expected = Series([1, 1, 1, 2, 1], index=index, name="education") + expected = Series([1, 1, 1, 2, 1], index=index, name="count") tm.assert_series_equal(result, expected) From b4df34acce8e39e0da3f3855bb0d61af00b603b4 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 13 Jan 2023 
15:18:31 +0000 Subject: [PATCH 23/23] remove outdated comment --- pandas/core/algorithms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d39de8a7e2eeb..225c1dd91bfd9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -891,7 +891,6 @@ def value_counts( .groupby(level=levels, dropna=dropna) .size() ) - # TODO: allow index names to remain (see discussion in GH49497) result.index.names = values.names counts = result._values