API: various .value_counts() result in different names / indices #49912

Merged: 36 commits, Feb 1, 2023
Commits (36)
0e50e19
sort out series / index case
Nov 25, 2022
214cc34
getting there...
Nov 25, 2022
c1fa208
only just fixed up another file!
Nov 25, 2022
d872960
more fixups
Nov 25, 2022
f7cf72a
getting there
Nov 25, 2022
6e281e1
wip, do this apply thing separately
Nov 25, 2022
0fc0bec
fixup
Nov 25, 2022
df6a108
more fixup
Nov 25, 2022
94187e4
tmp fixup per gh49909
Nov 25, 2022
f4734a2
:art:
Nov 25, 2022
eb8a070
fixup test
Nov 25, 2022
354dd95
fixup tests
Nov 25, 2022
ee9fb43
:label: typing
Nov 25, 2022
77b2bc7
:pencil: whatsnew
Nov 25, 2022
766a583
rewrite whatsnew
Nov 25, 2022
53608b3
add back missing line
Nov 25, 2022
19633a4
shorten
Nov 25, 2022
30a58e0
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Dec 9, 2022
c2c0f38
pin name, simplify whatsnew example, reference issue, retitle
Dec 9, 2022
33f681e
Merge branch 'main' into just-change-value-counts
MarcoGorelli Dec 11, 2022
e24265a
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Dec 18, 2022
2941826
avoid rename
Dec 18, 2022
2d40eac
Merge branch 'just-change-value-counts' of github.com:MarcoGorelli/pa…
Dec 18, 2022
ec48816
Merge branch 'main' into just-change-value-counts
MarcoGorelli Dec 27, 2022
016ddbb
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Dec 30, 2022
1bf010e
Merge branch 'main' into just-change-value-counts
MarcoGorelli Jan 4, 2023
8b3e366
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Jan 9, 2023
1ac51ab
fixup new test
Jan 9, 2023
6696d73
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Jan 11, 2023
d80ad10
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Jan 13, 2023
cacf010
adjust new path
Jan 13, 2023
8323c21
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Jan 13, 2023
416ca2b
fixup
Jan 13, 2023
b4df34a
remove outdated comment
Jan 13, 2023
229132b
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Jan 13, 2023
90d6afa
Merge remote-tracking branch 'upstream/main' into just-change-value-c…
Jan 31, 2023
30 changes: 30 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -330,6 +330,36 @@ a supported dtype:

pd.Series(["2016-01-01"], dtype="datetime64[D]")

.. _whatsnew_200.api_breaking.value_counts:

Value counts sets the resulting name to ``count``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In past versions, when running :meth:`Series.value_counts`, the result would inherit
the original object's name, and the result index would be nameless. This would cause
confusion when resetting the index, and the column names would not correspond with the
column values.
Now, the result name will be ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed),
and the index will be named after the original object (:issue:`49497`).

*Previous behavior*:

.. code-block:: ipython

In [8]: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts()

Out[8]:
quetzal 2
elk 1
Name: animal, dtype: int64

*New behavior*:

.. ipython:: python

pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts()

Likewise for other ``value_counts`` methods (for example, :meth:`DataFrame.value_counts`).

.. _whatsnew_200.api_breaking.astype_to_unsupported_datetimelike:

Disallow astype conversion to non-supported datetime64/timedelta64 dtypes
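To illustrate the DataFrame.value_counts() case mentioned in the value-counts whatsnew entry above, here is a minimal sketch (the toy data mirrors the docstring example later in this diff; the printed names assume pandas 2.0 behavior):

import pandas as pd

df = pd.DataFrame({"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]})

counts = df.value_counts()
# The resulting Series is now named "count", and its MultiIndex keeps
# the original column names.
print(counts.name)         # count
print(counts.index.names)  # ['num_legs', 'num_wings']

# With normalize=True the result is named "proportion" instead.
proportions = df.value_counts(normalize=True)
print(proportions.name)    # proportion
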
15 changes: 11 additions & 4 deletions pandas/core/algorithms.py
@@ -847,7 +847,8 @@ def value_counts(
Series,
)

name = getattr(values, "name", None)
index_name = getattr(values, "name", None)
name = "proportion" if normalize else "count"

if bins is not None:
from pandas.core.reshape.tile import cut
@@ -860,6 +861,7 @@

# count, remove nulls (from the index), and put the bins
result = ii.value_counts(dropna=dropna)
result.name = name
result = result[result.index.notna()]
result.index = result.index.astype("interval")
result = result.sort_index()
@@ -878,14 +880,18 @@
# handle Categorical and sparse,
result = Series(values)._values.value_counts(dropna=dropna)
result.name = name
result.index.name = index_name
counts = result._values

elif isinstance(values, ABCMultiIndex):
# GH49558
levels = list(range(values.nlevels))
result = Series(index=values).groupby(level=levels, dropna=dropna).size()
# TODO: allow index names to remain (see discussion in GH49497)
result.index.names = [None] * values.nlevels
result = (
Series(index=values, name=name)
.groupby(level=levels, dropna=dropna)
.size()
)
result.index.names = values.names
counts = result._values

else:
@@ -899,6 +905,7 @@
idx = Index(keys)
if idx.dtype == bool and keys.dtype == object:
idx = idx.astype(object)
idx.name = index_name

result = Series(counts, index=idx, name=name)

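As a hedged sketch of two of the code paths touched above (the bins branch and the MultiIndex branch), with illustrative data and assuming pandas 2.0 behavior:

import pandas as pd

# bins path: the binned result is also named "count" and carries an
# interval-typed index.
ser = pd.Series([1, 2, 2, 3, 3, 3])
binned = ser.value_counts(bins=2)
print(binned.name)         # count
print(binned.index.dtype)  # an interval dtype, e.g. interval[float64, right]

# MultiIndex path (GH49558): the level names of the original MultiIndex
# are now retained on the result's index.
mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 1), ("b", 2)], names=["x", "y"])
print(mi.value_counts().index.names)  # ['x', 'y']
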
2 changes: 1 addition & 1 deletion pandas/core/arrays/arrow/array.py
@@ -934,7 +934,7 @@ def value_counts(self, dropna: bool = True) -> Series:

index = Index(type(self)(values))

return Series(counts, index=index).astype("Int64")
return Series(counts, index=index, name="count").astype("Int64")

@classmethod
def _concat_same_type(
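A brief, hedged illustration of the Arrow-backed path changed above (assumes pyarrow is installed; only the result name is asserted here):

import pandas as pd

arr = pd.array([1, 1, 2], dtype="int64[pyarrow]")  # ArrowExtensionArray
res = arr.value_counts()
# Per the one-line change above, the resulting Series is named "count".
print(res.name)  # count
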
2 changes: 1 addition & 1 deletion pandas/core/arrays/categorical.py
@@ -1499,7 +1499,7 @@ def value_counts(self, dropna: bool = True) -> Series:
ix = coerce_indexer_dtype(ix, self.dtype.categories)
ix = self._from_backing_data(ix)

return Series(count, index=CategoricalIndex(ix), dtype="int64")
return Series(count, index=CategoricalIndex(ix), dtype="int64", name="count")

# error: Argument 2 of "_empty" is incompatible with supertype
# "NDArrayBackedExtensionArray"; supertype defines the argument type as
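A minimal sketch of the categorical path changed in this file (illustrative data; assumes pandas 2.0 behavior):

import pandas as pd

cat = pd.Series(["a", "b", "a"], dtype="category")
res = cat.value_counts()
# The counts are int64, the result is named "count", and the index is
# a CategoricalIndex built from the categories.
print(res.name)                  # count
print(res.dtype)                 # int64
print(type(res.index).__name__)  # CategoricalIndex
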
4 changes: 2 additions & 2 deletions pandas/core/arrays/masked.py
@@ -996,7 +996,7 @@ def value_counts(self, dropna: bool = True) -> Series:
)

if dropna:
res = Series(value_counts, index=keys)
res = Series(value_counts, index=keys, name="count")
res.index = res.index.astype(self.dtype)
res = res.astype("Int64")
return res
@@ -1012,7 +1012,7 @@ def value_counts(self, dropna: bool = True) -> Series:
mask = np.zeros(len(counts), dtype="bool")
counts_array = IntegerArray(counts, mask)

return Series(counts_array, index=index)
return Series(counts_array, index=index, name="count")

@doc(ExtensionArray.equals)
def equals(self, other) -> bool:
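A hedged sketch of the masked (nullable-dtype) path changed above, with illustrative data and assuming pandas 2.0 behavior:

import pandas as pd

ser = pd.Series([1, 2, 2, pd.NA], dtype="Int64")

res = ser.value_counts()  # dropna=True by default
print(res.name)   # count
print(res.dtype)  # Int64 (nullable counts, per the change above)

# dropna=False keeps the NA entry and is also named "count";
# normalize=True produces a result named "proportion".
print(ser.value_counts(dropna=False).name)    # count
print(ser.value_counts(normalize=True).name)  # proportion
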
8 changes: 4 additions & 4 deletions pandas/core/base.py
@@ -966,7 +966,7 @@ def value_counts(
1.0 1
2.0 1
4.0 1
dtype: int64
Name: count, dtype: int64

With `normalize` set to `True`, returns the relative frequency by
dividing all values by the sum of values.
@@ -977,7 +977,7 @@ def value_counts(
1.0 0.2
2.0 0.2
4.0 0.2
dtype: float64
Name: proportion, dtype: float64

**bins**

@@ -990,7 +990,7 @@ def value_counts(
(0.996, 2.0] 2
(2.0, 3.0] 2
(3.0, 4.0] 1
dtype: int64
Name: count, dtype: int64

**dropna**

@@ -1002,7 +1002,7 @@ def value_counts(
2.0 1
4.0 1
NaN 1
dtype: int64
Name: count, dtype: int64
"""
return algorithms.value_counts(
self,
14 changes: 8 additions & 6 deletions pandas/core/frame.py
@@ -7003,28 +7003,28 @@ def value_counts(
4 0 2
2 2 1
6 0 1
dtype: int64
Name: count, dtype: int64

>>> df.value_counts(sort=False)
num_legs num_wings
2 2 1
4 0 2
6 0 1
dtype: int64
Name: count, dtype: int64

>>> df.value_counts(ascending=True)
num_legs num_wings
2 2 1
6 0 1
4 0 2
dtype: int64
Name: count, dtype: int64

>>> df.value_counts(normalize=True)
num_legs num_wings
4 0 0.50
2 2 0.25
6 0 0.25
dtype: float64
Name: proportion, dtype: float64

With `dropna` set to `False` we can also count rows with NA values.

@@ -7041,20 +7041,22 @@ def value_counts(
first_name middle_name
Beth Louise 1
John Smith 1
dtype: int64
Name: count, dtype: int64

>>> df.value_counts(dropna=False)
first_name middle_name
Anne NaN 1
Beth Louise 1
John Smith 1
NaN 1
dtype: int64
Name: count, dtype: int64
"""
if subset is None:
subset = self.columns.tolist()

name = "proportion" if normalize else "count"
counts = self.groupby(subset, dropna=dropna).grouper.size()
counts.name = name

if sort:
counts = counts.sort_values(ascending=ascending)
21 changes: 13 additions & 8 deletions pandas/core/groupby/generic.py
@@ -666,10 +666,13 @@ def value_counts(
bins=None,
dropna: bool = True,
) -> Series | DataFrame:
name = "proportion" if normalize else "count"

if bins is None:
result = self._value_counts(
normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
)
result.name = name
return result

from pandas.core.reshape.merge import get_join_indexers
@@ -678,7 +681,7 @@ def value_counts(
ids, _, _ = self.grouper.group_info
val = self.obj._values

names = self.grouper.names + [self.obj.name]
index_names = self.grouper.names + [self.obj.name]

if is_categorical_dtype(val.dtype) or (
bins is not None and not np.iterable(bins)
@@ -693,7 +696,8 @@ def value_counts(
ascending=ascending,
bins=bins,
)
ser.index.names = names
ser.name = name
ser.index.names = index_names
return ser

# groupby removes null keys from groupings
@@ -803,13 +807,14 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray:
codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
codes.append(left[-1])

mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
mi = MultiIndex(
levels=levels, codes=codes, names=index_names, verify_integrity=False
)

if is_integer_dtype(out.dtype):
out = ensure_int64(out)
result = self.obj._constructor(out, index=mi, name=self.obj.name)
result = self.obj._constructor(out, index=mi, name=name)
if not self.as_index:
result.name = "proportion" if normalize else "count"
result = result.reset_index()
return result

@@ -2205,7 +2210,7 @@ def value_counts(
male low FR 2
US 1
medium FR 1
dtype: int64
Name: count, dtype: int64

>>> df.groupby('gender').value_counts(ascending=True)
gender education country
Expand All @@ -2214,7 +2219,7 @@ def value_counts(
male low US 1
medium FR 1
low FR 2
dtype: int64
Name: count, dtype: int64

>>> df.groupby('gender').value_counts(normalize=True)
gender education country
Expand All @@ -2223,7 +2228,7 @@ def value_counts(
male low FR 0.50
US 0.25
medium FR 0.25
dtype: float64
Name: proportion, dtype: float64

>>> df.groupby('gender', as_index=False).value_counts()
gender education country count
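A hedged sketch of the groupby cases, loosely based on the docstring examples above (illustrative data; assumes pandas 2.0 behavior):

import pandas as pd

df = pd.DataFrame({
    "gender": ["male", "male", "female", "male", "female", "male"],
    "country": ["US", "FR", "US", "FR", "FR", "FR"],
})

# DataFrameGroupBy.value_counts: the result Series is named "count"
# and its MultiIndex keeps the grouping and column names.
res = df.groupby("gender").value_counts()
print(res.name)         # count
print(res.index.names)  # ['gender', 'country']

# SeriesGroupBy.value_counts follows the same convention; with
# normalize=True the name becomes "proportion".
print(df.groupby("gender")["country"].value_counts(normalize=True).name)  # proportion
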
11 changes: 6 additions & 5 deletions pandas/core/groupby/groupby.py
@@ -2205,6 +2205,7 @@ def _value_counts(
raise NotImplementedError(
"DataFrameGroupBy.value_counts only handles axis=0"
)
name = "proportion" if normalize else "count"

with self._group_selection_context():
df = self.obj
@@ -2213,8 +2214,8 @@ def _value_counts(
grouping.name for grouping in self.grouper.groupings if grouping.in_axis
}
if isinstance(self._selected_obj, Series):
name = self._selected_obj.name
keys = [] if name in in_axis_names else [self._selected_obj]
_name = self._selected_obj.name
keys = [] if _name in in_axis_names else [self._selected_obj]
else:
unique_cols = set(self._selected_obj.columns)
if subset is not None:
@@ -2237,8 +2238,8 @@ def _value_counts(
keys = [
# Can't use .values because the column label needs to be preserved
self._selected_obj.iloc[:, idx]
for idx, name in enumerate(self._selected_obj.columns)
if name not in in_axis_names and name in subsetted
for idx, _name in enumerate(self._selected_obj.columns)
if _name not in in_axis_names and _name in subsetted
]

groupings = list(self.grouper.groupings)
@@ -2261,6 +2262,7 @@ def _value_counts(
dropna=self.dropna,
)
result_series = cast(Series, gb.size())
result_series.name = name

# GH-46357 Include non-observed categories
# of non-grouping columns regardless of `observed`
@@ -2304,7 +2306,6 @@ def _value_counts(
result = result_series
else:
# Convert to frame
name = "proportion" if normalize else "count"
index = result_series.index
columns = com.fill_missing_names(index.names)
if name in columns:
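With as_index=False, the same name becomes the column label of the returned frame; a minimal sketch with illustrative data:

import pandas as pd

df = pd.DataFrame({
    "gender": ["male", "male", "female"],
    "country": ["US", "FR", "FR"],
})

out = df.groupby("gender", as_index=False).value_counts()
print(list(out.columns))  # ['gender', 'country', 'count']

out_norm = df.groupby("gender", as_index=False).value_counts(normalize=True)
print(list(out_norm.columns))  # ['gender', 'country', 'proportion']
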
6 changes: 3 additions & 3 deletions pandas/tests/arrays/boolean/test_function.py
@@ -92,20 +92,20 @@ def test_ufunc_reduce_raises(values):
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=arr, dtype="Int64")
expected = pd.Series([1, 1, 1], index=arr, dtype="Int64", name="count")
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64")
expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64", name="count")
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
ser = pd.Series([True, False, pd.NA], dtype="boolean")
result = ser.value_counts(normalize=True)
expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2
expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64", name="proportion") / 2
assert expected.index.dtype == "boolean"
tm.assert_series_equal(result, expected)

8 changes: 4 additions & 4 deletions pandas/tests/arrays/floating/test_function.py
@@ -102,11 +102,11 @@ def test_value_counts_na():
result = arr.value_counts(dropna=False)
idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype)
assert idx.dtype == arr.dtype
expected = pd.Series([2, 1, 1], index=idx, dtype="Int64")
expected = pd.Series([2, 1, 1], index=idx, dtype="Int64", name="count")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64")
expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64", name="count")
tm.assert_series_equal(result, expected)


@@ -115,14 +115,14 @@ def test_value_counts_empty():
result = ser.value_counts()
idx = pd.Index([], dtype="Float64")
assert idx.dtype == "Float64"
expected = pd.Series([], index=idx, dtype="Int64")
expected = pd.Series([], index=idx, dtype="Int64", name="count")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3
assert expected.index.dtype == ser.dtype
tm.assert_series_equal(result, expected)
