Skip to content

Commit

Permalink
Add percent-based normalization in histplot (#2461)
Browse files Browse the repository at this point in the history
* Add percent-based normalization in histplot

* Test bivariate normalization

(cherry picked from commit d0acb8c)
  • Loading branch information
mwaskom committed Aug 7, 2021
1 parent cfb9745 commit 23d2442
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 16 deletions.
14 changes: 0 additions & 14 deletions doc/releases/v0.12.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,6 @@ v0.12.0 (Unreleased)

- |Enhancement| In :func:`histplot`, added `stat="percent"` as an option for normalization such that bar heights sum to 100 (:pr:`2461`).

- |Enhancement| In :func:`histplot`, improved performance with large datasets and many groupings/facets (:pr:`2559`, :pr:`2570`).

- |Enhancement| In :func:`kdeplot`, added the `warn_singular` parameter to allow silencing of the warning about data with zero variance (:pr:`2566`).

- |Enhancement| In :class:`FacetGrid`, :class:`PairGrid`, and functions that use them, the matplotlib `figure.autolayout` parameter is disabled to avoid having the legend overlap the plot (:pr:`2571`).

- |Enhancement| In :class:`FacetGrid` and functions that use it, the visibility of interior axis labels is now disabled, and exterior axis labels are no longer erased when adding additional layers (:pr:`2583`).

- |Enhancement| |Fix| Improved integration with the matplotlib color cycle in most axes-level functions (:pr:`2449`).

- |Feature| Added a ``refline`` method to :class:`FacetGrid` and :class:`JointGrid` for including horizontal and/or vertical reference lines using :meth:`matplotlib.axes.Axes.axhline`/:meth:`matplotlib.axes.Axes.axvline` (:pr:`2620`).

- |API| |Enhancement| In :func:`lmplot`, added a new `facet_kws` parameter and deprecated the `sharex`, `sharey`, and `legend_out` parameters from the function signature; pass them in the `facet_kws` dictionary (:pr:`2576`).

- |Fix| In :func:`lineplot`, allowed the `dashes` keyword to set the style of a line without mapping a `style` variable (:pr:`2449`).

- |Fix| In :func:`rugplot`, fixed a bug that prevented the use of datetime data (:pr:`2458`).
Expand Down
9 changes: 7 additions & 2 deletions seaborn/_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def __init__(
Parameters
----------
stat : {"count", "frequency", "density", "probability"}
stat : {"count", "frequency", "density", "probability", "percent"}
Aggregate statistic to compute in each bin.
- ``count`` shows the number of observations
Expand All @@ -229,7 +229,8 @@ def __init__(
If True, return the cumulative statistic.
"""
_check_argument("stat", ["count", "density", "probability", "frequency"], stat)
stat_choices = ["count", "frequency", "density", "probability", "percent"]
_check_argument("stat", stat_choices, stat)

self.stat = stat
self.bins = bins
Expand Down Expand Up @@ -337,6 +338,8 @@ def _eval_bivariate(self, x1, x2, weights):

if self.stat == "probability":
hist = hist.astype(float) / hist.sum()
elif self.stat == "percent":
hist = hist.astype(float) / hist.sum() * 100
elif self.stat == "frequency":
hist = hist.astype(float) / area

Expand All @@ -361,6 +364,8 @@ def _eval_univariate(self, x, weights):

if self.stat == "probability":
hist = hist.astype(float) / hist.sum()
elif self.stat == "percent":
hist = hist.astype(float) / hist.sum() * 100
elif self.stat == "frequency":
hist = hist.astype(float) / np.diff(bin_edges)

Expand Down
17 changes: 17 additions & 0 deletions seaborn/tests/test_distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1258,6 +1258,12 @@ def test_probability_stat_unique_norm(self, long_df):
bar_heights = [b.get_height() for b in bars]
assert sum(bar_heights) == pytest.approx(1)

def test_percent_stat(self, flat_series):
    """Check that ``stat="percent"`` yields bar heights that total 100."""

    ax = histplot(flat_series, stat="percent")
    bar_heights = [b.get_height() for b in ax.patches]
    # The heights are floats produced by division and scaling, so compare
    # with a tolerance (as the probability test does) rather than exactly.
    assert sum(bar_heights) == pytest.approx(100)

def test_common_bins(self, long_df):

n = 10
Expand Down Expand Up @@ -1796,6 +1802,17 @@ def test_mesh_unique_norm(self, long_df):
density, (x_edges, y_edges) = sub_hist(sub_df["x"], sub_df["y"])
assert_array_equal(mesh_data.data, density.T.flat)

@pytest.mark.parametrize("stat", ["probability", "percent"])
def test_mesh_normalization(self, long_df, stat):
    """Check that the bivariate mesh values sum to the normalization total.

    The expected total is 1 for ``stat="probability"`` and 100 for
    ``stat="percent"``.
    """

    ax = histplot(
        long_df, x="x", y="y", stat=stat,
    )

    mesh_data = ax.collections[0].get_array()
    expected_sum = {"probability": 1, "percent": 100}[stat]
    # The mesh values are normalized floats; summing them can accumulate
    # rounding error, so compare with a tolerance instead of exact equality.
    assert mesh_data.data.sum() == pytest.approx(expected_sum)

def test_mesh_colors(self, long_df):

color = "r"
Expand Down

0 comments on commit 23d2442

Please sign in to comment.