API: various .value_counts() result in different names / indices #49912
@@ -207,6 +207,44 @@ a supported dtype:

    pd.Series(["2016-01-01"], dtype="datetime64[D]")

.. _whatsnew_200.api_breaking.value_counts:

Value counts behaviour change
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In past versions, when running :meth:`Series.value_counts`, the result would inherit
the original object's name, and the result index would be nameless. This would cause
confusion when resetting the index, and the column names would not correspond with the
column values.

Now, the result name will be ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed),
and the index will be named after the original object.
Review comment: Could you add the issue number here?
*Previous behavior*:

.. code-block:: ipython
    In [8]: (
       ...:     pd.Series(['quetzal', 'quetzal', 'elk'], name='animal')
       ...:     .value_counts()
       ...:     .reset_index()
       ...: )
       ...:
    Out[8]:
         index  animal
    0  quetzal       2
    1      elk       1
*New behavior*:

.. ipython:: python

    (
        pd.Series(['quetzal', 'quetzal', 'elk'], name='animal')
        .value_counts()
        .reset_index()
    )

Likewise for other ``value_counts`` methods (for example, :meth:`DataFrame.value_counts`).
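As an aside on the "Likewise for other ``value_counts`` methods" note above, here is a minimal sketch (not part of the diff; the DataFrame and column names are invented) of what the new naming looks like for ``DataFrame.value_counts`` under the behaviour described in this entry:

```python
import pandas as pd

df = pd.DataFrame(
    {"animal": ["quetzal", "quetzal", "elk"], "region": ["north", "north", "south"]}
)

# Under the new behaviour the result Series is expected to be named "count"
# (or "proportion" when normalize=True), with the index named after the columns.
counts = df.value_counts()
print(counts.name)           # expected: 'count'
print(counts.reset_index())  # expected columns: animal, region, count

props = df.value_counts(normalize=True)
print(props.name)            # expected: 'proportion'
```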
.. _whatsnew_200.api_breaking.astype_to_unsupported_datetimelike:

Disallow astype conversion to non-supported datetime64/timedelta64 dtypes
pandas/core/groupby/generic.py
@@ -611,7 +611,8 @@ def value_counts(
         ids, _, _ = self.grouper.group_info
         val = self.obj._values

-        names = self.grouper.names + [self.obj.name]
+        index_names = self.grouper.names + [self.obj.name]
+        name = "proportion" if normalize else "count"

         if is_categorical_dtype(val.dtype) or (
             bins is not None and not np.iterable(bins)
@@ -625,8 +626,8 @@ def value_counts(
                 sort=sort,
                 ascending=ascending,
                 bins=bins,
-            )
-            ser.index.names = names
+            ).rename(name)
+            ser.index.names = index_names
             return ser

         # groupby removes null keys from groupings
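To illustrate the branch touched by this hunk (categorical dtype or scalar ``bins`` goes through ``self.apply``), here is a small hedged sketch of ``SeriesGroupBy.value_counts`` with the new naming; the data and names are invented for illustration:

```python
import pandas as pd

ser = pd.Series(pd.Categorical(["a", "a", "b"]), name="flavour")
key = pd.Series(["g1", "g1", "g2"], name="group")

# Categorical values take the apply(...) branch above; after this change the
# result should come back renamed to "count", with the index named after the
# grouper plus the original Series.
res = ser.groupby(key).value_counts()
print(res.name)               # expected: 'count'
print(list(res.index.names))  # expected: ['group', 'flavour']
```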
@@ -736,11 +737,13 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray:
         codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
         codes.append(left[-1])

-        mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
+        mi = MultiIndex(
+            levels=levels, codes=codes, names=index_names, verify_integrity=False
+        )

         if is_integer_dtype(out.dtype):
             out = ensure_int64(out)
-        return self.obj._constructor(out, index=mi, name=self.obj.name)
+        return self.obj._constructor(out, index=mi, name=name)

     def fillna(
         self,
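For the ordinary (non-categorical) path that ends in ``self.obj._constructor(out, index=mi, name=name)`` above, a similar sketch with ``normalize=True`` (again, invented data; treat the expected output as an assumption about the new behaviour):

```python
import pandas as pd

ser = pd.Series([1, 1, 2, 3], name="value")
key = pd.Series(["a", "a", "b", "b"], name="group")

# With normalize=True the result Series should now be named "proportion",
# and its MultiIndex should carry the grouper name plus the Series name.
res = ser.groupby(key).value_counts(normalize=True)
print(res.name)               # expected: 'proportion'
print(list(res.index.names))  # expected: ['group', 'value']
```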
@@ -1920,7 +1923,7 @@ def value_counts(
         male    low        FR         2
                            US         1
                 medium     FR         1
-        dtype: int64
+        Name: count, dtype: int64

         >>> df.groupby('gender').value_counts(ascending=True)
         gender  education  country
@@ -1929,7 +1932,7 @@ def value_counts(
         male    low        US         1
                 medium     FR         1
                 low        FR         2
-        dtype: int64
+        Name: count, dtype: int64

         >>> df.groupby('gender').value_counts(normalize=True)
         gender  education  country
@@ -1938,7 +1941,7 @@ def value_counts(
         male    low        FR         0.50
                            US         0.25
                 medium     FR         0.25
-        dtype: float64
+        Name: proportion, dtype: float64

         >>> df.groupby('gender', as_index=False).value_counts()
            gender education country  count
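The docstring examples in these hunks operate on a DataFrame whose construction lies outside the diff; the following reconstruction is consistent with the output shown above, but should be treated as an assumption:

```python
import pandas as pd

# Reconstructed from the expected output above; not part of this diff.
df = pd.DataFrame(
    {
        "gender": ["male", "male", "female", "male", "female", "male"],
        "education": ["low", "medium", "high", "low", "high", "low"],
        "country": ["US", "FR", "US", "FR", "FR", "FR"],
    }
)

counts = df.groupby("gender").value_counts()
print(counts.name)  # expected: 'count' after this change

props = df.groupby("gender").value_counts(normalize=True)
print(props.name)   # expected: 'proportion' after this change
```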
@@ -1960,6 +1963,7 @@ def value_counts(
             raise NotImplementedError(
                 "DataFrameGroupBy.value_counts only handles axis=0"
             )
+        name = "proportion" if normalize else "count"

         with self._group_selection_context():
             df = self.obj
@@ -1968,8 +1972,8 @@ def value_counts(
                 grouping.name for grouping in self.grouper.groupings if grouping.in_axis
             }
             if isinstance(self._selected_obj, Series):
-                name = self._selected_obj.name
-                keys = [] if name in in_axis_names else [self._selected_obj]
+                _name = self._selected_obj.name
+                keys = [] if _name in in_axis_names else [self._selected_obj]
             else:
                 unique_cols = set(self._selected_obj.columns)
                 if subset is not None:
@@ -1992,8 +1996,8 @@ def value_counts(
                 keys = [
                     # Can't use .values because the column label needs to be preserved
                     self._selected_obj.iloc[:, idx]
-                    for idx, name in enumerate(self._selected_obj.columns)
-                    if name not in in_axis_names and name in subsetted
+                    for idx, _name in enumerate(self._selected_obj.columns)
+                    if _name not in in_axis_names and _name in subsetted
                 ]

             groupings = list(self.grouper.groupings)
@@ -2015,7 +2019,7 @@ def value_counts(
                 observed=self.observed,
                 dropna=self.dropna,
             )
-            result_series = cast(Series, gb.size())
+            result_series = cast(Series, gb.size()).rename(name)

             # GH-46357 Include non-observed categories
             # of non-grouping columns regardless of `observed`
@@ -2059,14 +2063,12 @@ def value_counts(
                 result = result_series
             else:
                 # Convert to frame
-                name = "proportion" if normalize else "count"
                 index = result_series.index
                 columns = com.fill_missing_names(index.names)
                 if name in columns:
                     raise ValueError(
                         f"Column label '{name}' is duplicate of result column"
                     )
-                result_series.name = name
                 result_series.index = index.set_names(range(len(columns)))
Review comment: should set_names here get something different?

Reply: I don't think so, no - it looks like this is intentionally done to get around duplicate column names, e.g. if it instead was ``result_series.index.names = columns``, then this would throw:

    In [3]: df.groupby(["a", [0, 1], "d"], as_index=False).value_counts()
    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    Cell In[3], line 1
    ----> 1 df.groupby(["a", [0, 1], "d"], as_index=False).value_counts()

    File ~/pandas-dev/pandas/core/groupby/generic.py:2142, in DataFrameGroupBy.value_counts(self, subset, normalize, sort, ascending, dropna)
       2138     raise ValueError(
       2139         f"Column label '{name}' is duplicate of result column"
       2140     )
       2141 result_series.index.names = columns
    -> 2142 result_frame = result_series.reset_index()
       2143 result = result_frame
       2144 return result.__finalize__(self.obj, method="value_counts")

    File ~/pandas-dev/pandas/core/series.py:1541, in Series.reset_index(self, level, drop, name, inplace, allow_duplicates)
       1538     name = self.name
       1540 df = self.to_frame(name)
    -> 1541 return df.reset_index(
       1542     level=level, drop=drop, allow_duplicates=allow_duplicates
       1543 )
       1544 return None

    File ~/pandas-dev/pandas/core/frame.py:6115, in DataFrame.reset_index(self, level, drop, inplace, col_level, col_fill, allow_duplicates, names)
       6109 if lab is not None:
       6110     # if we have the codes, extract the values with a mask
       6111     level_values = algorithms.take(
       6112         level_values, lab, allow_fill=True, fill_value=lev._na_value
       6113     )
    -> 6115 new_obj.insert(
       6116     0,
       6117     name,
       6118     level_values,
       6119     allow_duplicates=allow_duplicates,
       6120 )
       6122 new_obj.index = new_index
       6123 if not inplace:

    File ~/pandas-dev/pandas/core/frame.py:4682, in DataFrame.insert(self, loc, column, value, allow_duplicates)
       4676     raise ValueError(
       4677         "Cannot specify 'allow_duplicates=True' when "
       4678         "'self.flags.allows_duplicate_labels' is False."
       4679     )
       4680 if not allow_duplicates and column in self.columns:
       4681     # Should this be a different kind of error??
    -> 4682     raise ValueError(f"cannot insert {column}, already exists")
       4683 if not isinstance(loc, int):
       4684     raise TypeError("loc must be int")

    ValueError: cannot insert b, already exists
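A minimal sketch of the workaround discussed in this thread: numbering the index levels before ``reset_index`` sidesteps the duplicate-insert error, and the intended (possibly duplicated) labels are assigned to the columns afterwards. The toy index and labels below are invented for illustration:

```python
import pandas as pd

# A Series whose MultiIndex has two levels with the same name, as can happen
# when groupby keys share a label.
idx = pd.MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["b", "b"])
ser = pd.Series([10, 20], index=idx, name="count")

# A plain reset_index fails: both levels want to become a column named "b".
try:
    ser.reset_index()
except ValueError as err:
    print(err)  # cannot insert b, already exists

# The trick used in the diff: temporarily number the levels, reset the index,
# then assign the intended column labels (duplicates allowed) afterwards.
tmp = ser.copy()
tmp.index = tmp.index.set_names(range(tmp.index.nlevels))
frame = tmp.reset_index()
frame.columns = ["b", "b", "count"]
print(frame)
```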
                 result_frame = result_series.reset_index()
                 result_frame.columns = columns + [name]
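As a sketch of the guard kept in this hunk: with ``as_index=False`` the result is converted to a frame, so a pre-existing column literally named ``'count'`` (or ``'proportion'`` with ``normalize=True``) collides with the result column and raises. Example data invented for illustration:

```python
import pandas as pd

df = pd.DataFrame({"group": ["a", "a", "b"], "count": [1, 1, 2]})

try:
    # The "count" result column would collide with the existing "count" column.
    df.groupby("group", as_index=False).value_counts()
except ValueError as err:
    print(err)  # expected: Column label 'count' is duplicate of result column
```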
Review comment: Something more specific, like "``value_counts`` sets the resulting name to ``"count"``", would be good.