diff --git a/doc/api.rst b/doc/api.rst index 81cde3ea95..563559c556 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -45,6 +45,7 @@ Distribution plots distplot histplot + ecdfplot kdeplot rugplot diff --git a/doc/docstrings/ecdfplot.ipynb b/doc/docstrings/ecdfplot.ipynb new file mode 100644 index 0000000000..ef41c7c8c9 --- /dev/null +++ b/doc/docstrings/ecdfplot.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot a univariate distribution along the x axis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns; sns.set()\n", + "penguins = sns.load_dataset(\"penguins\")\n", + "sns.ecdfplot(data=penguins, x=\"flipper_length_mm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Flip the plot by assigning the data variable to the y axis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins, y=\"flipper_length_mm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If neither `x` nor `y` is assigned, the dataset is treated as wide-form, and an ECDF is drawn for each numeric column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins.filter(like=\"culmen_\", axis=\"columns\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also draw multiple ECDFs from a long-form dataset with hue mapping:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins, x=\"culmen_length_mm\", hue=\"species\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default distribution statistic is normalized to show a proportion, but you can show absolute counts instead:" +
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins, x=\"culmen_length_mm\", hue=\"species\", stat=\"count\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's also possible to plot the empirical complementary CDF (1 - CDF):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins, x=\"culmen_length_mm\", hue=\"species\", complementary=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "seaborn-refactor (py38)", + "language": "python", + "name": "seaborn-refactor" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/doc/docstrings/histplot.ipynb b/doc/docstrings/histplot.ipynb index 2efde821ef..8e8fc5f7e1 100644 --- a/doc/docstrings/histplot.ipynb +++ b/doc/docstrings/histplot.ipynb @@ -103,7 +103,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can also draw multiple histograms from a long-form dataset with hue mapping:" + "You can otherwise draw multiple histograms from a long-form dataset with hue mapping:" ] }, { diff --git a/doc/releases/v0.11.0.txt b/doc/releases/v0.11.0.txt index 53c6690c5c..d6e24b693a 100644 --- a/doc/releases/v0.11.0.txt +++ b/doc/releases/v0.11.0.txt @@ -9,7 +9,9 @@ v0.11.0 (Unreleased) Modernization of distribution functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -First, a new function, :func:`histplot` has been added. 
:func:`histplot` draws univariate or bivariate histograms with a number of features, including: +First, two new functions, :func:`histplot` and :func:`ecdfplot` have been added. + +:func:`histplot` draws univariate or bivariate histograms with a number of features, including: - mapping multiple distributions with a ``hue`` semantic - normalization to show density, probability, or frequency statistics @@ -17,6 +19,8 @@ First, a new function, :func:`histplot` has been added. :func:`histplot` draws u - adding a KDE fit to show a smoothed distribution over all bin statistics - experimental support for histograms over categorical and datetime variables. GH2125 +:func:`ecdfplot` draws univariate empirical cumulative distribution functions, using a similar interface. + Second, the existing functions :func:`kdeplot` and :func:`rugplot` have been completely overhauled. Two of the oldest functions in the library, these lacked aspects of the otherwise-common seaborn API, such as the ability to assign variables by name from a ``data`` object; they had no capacity for semantic mapping; and they had numerous other inconsistencies and smaller issues. The overhauled functions now share a common API with the rest of seaborn, they can show conditional distributions by mapping a third variable with a ``hue`` semantic, and have been improved in numerous other ways. The `github pull request (GH2104) <https://github.com/mwaskom/seaborn/pull/2104>`_ has a longer explanation of the changes and the motivation behind them. diff --git a/seaborn/_docstrings.py b/seaborn/_docstrings.py index 010dcdb8e5..a00bb79b04 100644 --- a/seaborn/_docstrings.py +++ b/seaborn/_docstrings.py @@ -116,6 +116,9 @@ def from_function_params(cls, func): """, kdeplot=""" kdeplot : Plot univariate or bivariate distributions using kernel density estimation. + """, + ecdfplot=""" +ecdfplot : Plot empirical cumulative distribution functions. """, rugplot=""" rugplot : Plot a tick at each observation value along the x and/or y axes.
diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py index 951795c100..40d5801364 100644 --- a/seaborn/_statistics.py +++ b/seaborn/_statistics.py @@ -1,3 +1,29 @@ +"""Statistical transformations for visualization. + +This module is currently private, but is being written to eventually form part +of the public API. + +The classes should behave roughly in the style of scikit-learn. + +- All data-independent parameters should be passed to the class constructor. +- Each class should implement a default transformation that is exposed through + __call__. These are currently written for vector arguments, but I think + consuming a whole `plot_data` DataFrame and returning it with transformed + variables would make more sense. +- Some classes have data-dependent preprocessing that should be cached and used + multiple times (think defining histogram bins off all data and then counting + observations within each bin multiple times per data subset). These currently + have unique names, but it would be good to have a common name. Not quite + `fit`, but something similar. +- Alternatively, the transform interface could take some information about grouping + variables and do a groupby internally. +- Some classes should define alternate transforms that might make the most sense + with a different function. For example, KDE usually evaluates the distribution + on a regular grid, but it would be useful for it to transform at the actual + datapoints. Then again, this could be controlled by a parameter at the time of + class instantiation.
+ +""" from distutils.version import LooseVersion from numbers import Number import numpy as np @@ -345,3 +371,56 @@ def __call__(self, x1, x2=None, weights=None): return self._eval_univariate(x1, weights) else: return self._eval_bivariate(x1, x2, weights) + + +class ECDF: + """Univariate empirical cumulative distribution estimator.""" + def __init__(self, stat="proportion", complementary=False): + """Initialize the class with its parameters + + Parameters + ---------- + stat : {{"proportion", "count"}} + Distribution statistic to compute. + complementary : bool + If True, use the complementary CDF (1 - CDF) + + """ + _check_argument("stat", ["count", "proportion"], stat) + self.stat = stat + self.complementary = complementary + + def _eval_bivariate(self, x1, x2, weights): + """Inner function for ECDF of two variables.""" + raise NotImplementedError("Bivariate ECDF is not implemented") + + def _eval_univariate(self, x, weights): + """Inner function for ECDF of one variable.""" + sorter = x.argsort() + x = x[sorter] + weights = weights[sorter] + y = weights.cumsum() + + if self.stat == "proportion": + y = y / y.max() + + x = np.r_[-np.inf, x] + y = np.r_[0, y] + + if self.complementary: + y = y.max() - y + + return y, x + + def __call__(self, x1, x2=None, weights=None): + """Return proportion or count of observations below each sorted datapoint.""" + x1 = np.asarray(x1) + if weights is None: + weights = np.ones_like(x1) + else: + weights = np.asarray(weights) + + if x2 is None: + return self._eval_univariate(x1, weights) + else: + return self._eval_bivariate(x1, x2, weights) diff --git a/seaborn/distributions.py b/seaborn/distributions.py index c26f583745..54d06a9afe 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -18,6 +18,7 @@ from ._statistics import ( KDE, Histogram, + ECDF, ) from .utils import ( remove_na, @@ -33,7 +34,7 @@ ) -__all__ = ["distplot", "histplot", "kdeplot",
"ecdfplot", "rugplot"] # ==================================================================================== # # Module documentation @@ -75,6 +76,7 @@ dist=DocstringComponents(_dist_params), kde=DocstringComponents.from_function_params(KDE.__init__), hist=DocstringComponents.from_function_params(Histogram.__init__), + ecdf=DocstringComponents.from_function_params(ECDF.__init__), ) @@ -538,8 +540,12 @@ def plot_univariate_histogram( y = np.append(hist["heights"], final["heights"]) b = np.append(bottom, bottom[-1]) - step = "post" - drawstyle = "steps-post" + if self.data_variable == "x": + step = "post" + drawstyle = "steps-post" + else: + step = "post" # fillbetweenx handles mapping internally + drawstyle = "steps-pre" elif element == "poly": @@ -1128,6 +1134,70 @@ def plot_bivariate_density( ax, artist, fill, False, "layer", 1, artist_kws, {}, ) + def plot_univariate_ecdf(self, estimate_kws, legend, plot_kws, ax): + + # TODO see notes elsewhere about GH2135 + cols = list(self.variables) + + estimator = ECDF(**estimate_kws) + + # Set the draw style to step the right way for the data variable + drawstyles = dict(x="steps-post", y="steps-pre") + plot_kws["drawstyle"] = drawstyles[self.data_variable] + + # Loop through the subsets, transform and plot the data + for sub_vars, sub_data in self._semantic_subsets( + "hue", reverse=True, from_comp_data=True, + ): + + # Compute the ECDF + sub_data = sub_data[cols].dropna() + if sub_data.empty: + continue + + observations = sub_data[self.data_variable] + weights = sub_data.get("weights", None) + stat, vals = estimator(observations, weights=weights) + + # Assign attributes based on semantic mapping + artist_kws = plot_kws.copy() + if "hue" in self.variables: + artist_kws["color"] = self._hue_map(sub_vars["hue"]) + + # Work out the orientation of the plot + if self.data_variable == "x": + plot_args = vals, stat + stat_variable = "y" + else: + plot_args = stat, vals + stat_variable = "x" + + if estimator.stat == "count": + top_edge =
len(observations) + else: + top_edge = 1 + + # Draw the line for this subset + artist, = ax.plot(*plot_args, **artist_kws) + sticky_edges = getattr(artist.sticky_edges, stat_variable) + sticky_edges[:] = 0, top_edge + + # --- Finalize the plot ---- + stat = estimator.stat.capitalize() + default_x = default_y = "" + if self.data_variable == "x": + default_y = stat + if self.data_variable == "y": + default_x = stat + self._add_axis_labels(ax, default_x, default_y) + + if "hue" in self.variables and legend: + artist = partial(mpl.lines.Line2D, [], []) + alpha = plot_kws.get("alpha", 1) + self._add_legend( + ax, artist, False, False, None, alpha, plot_kws, {}, + ) + def plot_rug(self, height, expand_margins, legend, ax, kws): kws = _normalize_kwargs(kws, mpl.lines.Line2D) @@ -1413,6 +1483,7 @@ def histplot( -------- {seealso.kdeplot} {seealso.rugplot} +{seealso.ecdfplot} {seealso.jointplot} distplot @@ -1719,9 +1790,10 @@ def kdeplot( See Also -------- +{seealso.violinplot} {seealso.histplot} +{seealso.ecdfplot} {seealso.rugplot} -{seealso.violinplot} {seealso.jointplot} distplot @@ -1770,6 +1842,111 @@ def kdeplot( ) +def ecdfplot( + data=None, *, + # Vector variables + x=None, y=None, hue=None, weights=None, + # Computation parameters + stat="proportion", complementary=False, + # Hue mapping parameters + palette=None, hue_order=None, hue_norm=None, + # Axes information + log_scale=None, legend=True, ax=None, + # Other appearance keywords + **kwargs, +): + + p = _DistributionPlotter( + data=data, + variables=_DistributionPlotter.get_semantics(locals()) + ) + + p.map_hue(palette=palette, order=hue_order, norm=hue_norm) + + # We could support other semantics (size, style) here fairly easily + # But it would make distplot a bit more complicated. + # It's always possible to add features like that later, so I am going to defer. + # It will be even easier to wait until after there is a more general/abstract + # way to go from semantic specs to artist attributes. 
+ + if ax is None: + ax = plt.gca() + + # We could add this one day, but it's of dubious value + if not p.univariate: + raise NotImplementedError("Bivariate ECDF plots are not implemented") + + # Attach the axes to the plotter, setting up unit conversions + p._attach(ax, log_scale=log_scale) + + estimate_kws = dict( + stat=stat, + complementary=complementary, + ) + + p.plot_univariate_ecdf( + estimate_kws=estimate_kws, + legend=legend, + plot_kws=kwargs, + ax=ax, + ) + + return ax + + +ecdfplot.__doc__ = """\ +Plot empirical cumulative distribution functions. + +An ECDF represents the proportion or count of observations falling below each +unique value in a dataset. Compared to a histogram or density plot, it has the +advantage that each observation is visualized directly, meaning that there are +no binning or smoothing parameters that need to be adjusted. It also aids direct +comparisons between multiple distributions. A downside is that the relationship +between the appearance of the plot and the basic properties of the distribution +(such as its central tendency, variance, and the presence of any bimodality) +may not be as intuitive. + +More information is provided in the :ref:`user guide <tutorial_ecdf>`. + +Parameters +---------- +{params.core.data} +{params.core.xy} +{params.core.hue} +{params.ecdf.stat} +{params.ecdf.complementary} +{params.core.palette} +{params.core.hue_order} +{params.core.hue_norm} +{params.dist.log_scale} +{params.dist.legend} +{params.core.ax} +kwargs + Other keyword arguments are passed to :meth:`matplotlib.axes.Axes.plot`. + +Returns +------- +{returns.ax} + +See Also +-------- +{seealso.histplot} +{seealso.kdeplot} +{seealso.rugplot} +distplot + +Examples +-------- + +..
include:: ../docstrings/ecdfplot.rst + +""".format( + params=_param_docs, + returns=_core_docs["returns"], + seealso=_core_docs["seealso"], +) + + @_deprecate_positional_args def rugplot( x=None, # Allow positional x, because behavior won't change diff --git a/seaborn/tests/test_distributions.py b/seaborn/tests/test_distributions.py index 2ea44faa23..2593eca382 100644 --- a/seaborn/tests/test_distributions.py +++ b/seaborn/tests/test_distributions.py @@ -26,8 +26,9 @@ from ..distributions import ( _DistributionPlotter, histplot, - rugplot, + ecdfplot, kdeplot, + rugplot, ) @@ -1793,3 +1794,94 @@ def test_colorbar(self, long_df): f, (ax, cax) = plt.subplots(2) histplot(long_df, x="x", y="y", cbar=True, cbar_ax=cax, ax=ax) assert len(ax.figure.axes) == 2 + + +class TestECDFPlotUnivariate: + + @pytest.mark.parametrize("variable", ["x", "y"]) + def test_long_vectors(self, long_df, variable): + + vector = long_df[variable] + vectors = [ + variable, vector, np.asarray(vector), vector.tolist(), + ] + + f, ax = plt.subplots() + for vector in vectors: + ecdfplot(data=long_df, ax=ax, **{variable: vector}) + + xdata = [l.get_xdata() for l in ax.lines] + for a, b in itertools.product(xdata, xdata): + assert_array_equal(a, b) + + ydata = [l.get_ydata() for l in ax.lines] + for a, b in itertools.product(ydata, ydata): + assert_array_equal(a, b) + + def test_hue(self, long_df): + + ax = ecdfplot(long_df, x="x", hue="a") + + for line, color in zip(ax.lines[::-1], color_palette()): + assert line.get_color() == color + + def test_line_kwargs(self, long_df): + + color = "r" + ls = "--" + lw = 3 + ax = ecdfplot(long_df, x="x", color=color, ls=ls, lw=lw) + + for line in ax.lines: + assert to_rgb(line.get_color()) == to_rgb(color) + assert line.get_linestyle() == ls + assert line.get_linewidth() == lw + + @pytest.mark.parametrize("data_var", ["x", "y"]) + def test_drawstyle(self, flat_series, data_var): + + ax = ecdfplot(**{data_var: flat_series}) + drawstyles = dict(x="steps-post", 
y="steps-pre") + assert ax.lines[0].get_drawstyle() == drawstyles[data_var] + + @pytest.mark.parametrize( + "data_var,stat_var", [["x", "y"], ["y", "x"]], + ) + def test_proportion_limits(self, flat_series, data_var, stat_var): + + ax = ecdfplot(**{data_var: flat_series}) + data = getattr(ax.lines[0], f"get_{stat_var}data")() + assert data[0] == 0 + assert data[-1] == 1 + sticky_edges = getattr(ax.lines[0].sticky_edges, stat_var) + assert sticky_edges[:] == [0, 1] + + @pytest.mark.parametrize( + "data_var,stat_var", [["x", "y"], ["y", "x"]], + ) + def test_proportion_limits_complementary(self, flat_series, data_var, stat_var): + + ax = ecdfplot(**{data_var: flat_series}, complementary=True) + data = getattr(ax.lines[0], f"get_{stat_var}data")() + assert data[0] == 1 + assert data[-1] == 0 + sticky_edges = getattr(ax.lines[0].sticky_edges, stat_var) + assert sticky_edges[:] == [0, 1] + + @pytest.mark.parametrize( + "data_var,stat_var", [["x", "y"], ["y", "x"]], + ) + def test_proportion_count(self, flat_series, data_var, stat_var): + + n = len(flat_series) + ax = ecdfplot(**{data_var: flat_series}, stat="count") + data = getattr(ax.lines[0], f"get_{stat_var}data")() + assert data[0] == 0 + assert data[-1] == n + sticky_edges = getattr(ax.lines[0].sticky_edges, stat_var) + assert sticky_edges[:] == [0, n] + + def test_bivariate_error(self, long_df): + + with pytest.raises(NotImplementedError, match="Bivariate ECDF plots"): + ecdfplot(data=long_df, x="x", y="y") diff --git a/seaborn/tests/test_statistics.py b/seaborn/tests/test_statistics.py index 87a63169cf..4590849e3a 100644 --- a/seaborn/tests/test_statistics.py +++ b/seaborn/tests/test_statistics.py @@ -1,15 +1,36 @@ import numpy as np from scipy import integrate +try: + import statsmodels.distributions as smdist +except ImportError: + smdist = None + import pytest -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_array_almost_equal from .._statistics import ( KDE, 
Histogram, + ECDF, ) +class DistributionFixtures: + + @pytest.fixture + def x(self, rng): + return rng.normal(0, 1, 100) + + @pytest.fixture + def y(self, rng): + return rng.normal(0, 5, 100) + + @pytest.fixture + def weights(self, rng): + return rng.uniform(0, 5, 100) + + class TestKDE: def test_gridsize(self, rng): @@ -127,15 +148,7 @@ def test_bivariate_cumulative(self, rng): assert density[-1, -1] == pytest.approx(1, abs=1e-2) -class TestHistogram: - - @pytest.fixture - def x(self, rng): - return rng.normal(0, 1, 100) - - @pytest.fixture - def y(self, rng): - return rng.normal(0, 5, 100) +class TestHistogram(DistributionFixtures): def test_string_bins(self, x): @@ -379,3 +392,66 @@ def test_bad_stat(self): with pytest.raises(ValueError): Histogram(stat="invalid") + + +class TestECDF(DistributionFixtures): + + def test_univariate_proportion(self, x): + + ecdf = ECDF() + stat, vals = ecdf(x) + assert_array_equal(vals[1:], np.sort(x)) + assert_array_almost_equal(stat[1:], np.linspace(0, 1, len(x) + 1)[1:]) + assert stat[0] == 0 + + def test_univariate_count(self, x): + + ecdf = ECDF(stat="count") + stat, vals = ecdf(x) + + assert_array_equal(vals[1:], np.sort(x)) + assert_array_almost_equal(stat[1:], np.arange(len(x)) + 1) + assert stat[0] == 0 + + def test_univariate_proportion_weights(self, x, weights): + + ecdf = ECDF() + stat, vals = ecdf(x, weights=weights) + assert_array_equal(vals[1:], np.sort(x)) + expected_stats = weights[x.argsort()].cumsum() / weights.sum() + assert_array_almost_equal(stat[1:], expected_stats) + assert stat[0] == 0 + + def test_univariate_count_weights(self, x, weights): + + ecdf = ECDF(stat="count") + stat, vals = ecdf(x, weights=weights) + assert_array_equal(vals[1:], np.sort(x)) + assert_array_almost_equal(stat[1:], weights[x.argsort()].cumsum()) + assert stat[0] == 0 + + @pytest.mark.skipif(smdist is None, reason="Requires statsmodels") + def test_against_statsmodels(self, x): + + sm_ecdf = smdist.empirical_distribution.ECDF(x) + + 
ecdf = ECDF() + stat, vals = ecdf(x) + assert_array_equal(vals, sm_ecdf.x) + assert_array_almost_equal(stat, sm_ecdf.y) + + ecdf = ECDF(complementary=True) + stat, vals = ecdf(x) + assert_array_equal(vals, sm_ecdf.x) + assert_array_almost_equal(stat, sm_ecdf.y[::-1]) + + def test_invalid_stat(self, x): + + with pytest.raises(ValueError, match="`stat` must be one of"): + ECDF(stat="density") + + def test_bivariate_error(self, x, y): + + with pytest.raises(NotImplementedError, match="Bivariate ECDF"): + ecdf = ECDF() + ecdf(x, y)