From c8745a44afc1d79664fc56fcc7931f09a6fa5e84 Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Mon, 15 Jun 2020 16:53:46 -0400 Subject: [PATCH 01/11] Add basic ecdfplot implementation --- seaborn/_statistics.py | 36 +++++++++++++ seaborn/distributions.py | 107 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 142 insertions(+), 1 deletion(-) diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py index 951795c100..8ab07501cb 100644 --- a/seaborn/_statistics.py +++ b/seaborn/_statistics.py @@ -345,3 +345,39 @@ def __call__(self, x1, x2=None, weights=None): return self._eval_univariate(x1, weights) else: return self._eval_bivariate(x1, x2, weights) + + +class ECDF: + + def __init__(self, stat="proportion"): + + _check_argument("stat", ["count", "proportion"], stat) + self.stat = stat + + # Do we need bivariate ECDF? + + def _eval_univariate(self, x, weights): + + sorter = np.argsort(x) + x = x[sorter] + weights = weights[sorter] + + y = weights.cumsum() + + if self.stat == "proportion": + y = y / y.max() + + x = np.r_[-np.inf, x] + y = np.r_[0, y] + + return y, x + + def __call__(self, x1, weights=None): + + x1 = np.asarray(x1) + if weights is None: + weights = np.ones_like(x1) + else: + weights = np.asarray(weights) + + return self._eval_univariate(x1, weights) diff --git a/seaborn/distributions.py b/seaborn/distributions.py index c26f583745..f56686fb21 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -18,6 +18,7 @@ from ._statistics import ( KDE, Histogram, + ECDF, ) from .utils import ( remove_na, @@ -33,7 +34,7 @@ ) -__all__ = ["distplot", "histplot", "kdeplot", "rugplot"] +__all__ = ["distplot", "histplot", "kdeplot", "ecdfplot", "rugplot"] # ==================================================================================== # # Module documentation @@ -1128,6 +1129,59 @@ def plot_bivariate_density( ax, artist, fill, False, "layer", 1, artist_kws, {}, ) + def plot_univariate_ecdf(self, estimate_kws, 
legend, plot_kws, ax): + + # TODO see notes elsewhere about GH2135 + cols = list(self.variables) + + # TODO maybe have an option for joint start/end? + estimator = ECDF(**estimate_kws) + + # Loop through the subsets, transform and plot the data + for sub_vars, sub_data in self._semantic_subsets( + "hue", reverse=True, from_comp_data=True, + ): + + # "Compute" the ECDF + sub_data = sub_data[cols].dropna() + observations = sub_data[self.data_variable] + weights = sub_data.get("weights", None) + stat, vals = estimator(observations, weights) + + # Assign attributes based on semantic mapping + artist_kws = plot_kws.copy() + if "hue" in self.variables: + artist_kws["color"] = self._hue_map(sub_vars["hue"]) + + # Work out the orientation of the plot + if self.data_variable == "x": + plot_args = vals, stat + stat_variable = "y" + else: + plot_args = stat, vals + stat_variable = "x" + + # Draw the line for this subset + artist, = ax.plot(*plot_args, drawstyle="steps-post", **artist_kws) + sticky_edges = getattr(artist.sticky_edges, stat_variable) + sticky_edges[:] = 0, 1 + + # --- Finalize the plot ---- + stat = estimator.stat.capitalize() + default_x = default_y = "" + if self.data_variable == "x": + default_y = stat + if self.data_variable == "y": + default_x = stat + self._add_axis_labels(ax, default_x, default_y) + + if "hue" in self.variables and legend: + artist = partial(mpl.lines.Line2D, [], []) + alpha = plot_kws.get("alpha", 1) + self._add_legend( + ax, artist, False, False, None, alpha, plot_kws, {}, + ) + def plot_rug(self, height, expand_margins, legend, ax, kws): kws = _normalize_kwargs(kws, mpl.lines.Line2D) @@ -1770,6 +1824,57 @@ def kdeplot( ) +def ecdfplot( + data=None, *, + # Vector variables + x=None, y=None, hue=None, weights=None, + # Computation parameters + stat="proportion", + # Hue mapping parameters + palette=None, hue_order=None, hue_norm=None, color=None, + # Axes information + log_scale=None, legend=True, ax=None, + # Other appearance keywords 
+ **kwargs, +): + + p = _DistributionPlotter( + data=data, + variables=_DistributionPlotter.get_semantics(locals()) + ) + + p.map_hue(palette=palette, order=hue_order, norm=hue_norm) + + # We could support other semantics (size, style) here fairly easily + # But it would make distplot a bit more complicated. + # It's always possible to add features like that later, so I am going to defer. + # It will be even easier to wait until after there is a more general/abstract + # way to go from semantic specs to artist attributes. + + if ax is None: + ax = plt.gca() + + # We could add this one day, but it's of dubious value + if not p.univariate: + raise NotImplementedError("Bivariate ECDF plots are not implemented") + + # Attach the axes to the plotter, setting up unit conversions + p._attach(ax, log_scale=log_scale) + + estimate_kws = dict( + stat=stat, + ) + + p.plot_univariate_ecdf( + estimate_kws=estimate_kws, + legend=legend, + plot_kws=kwargs, + ax=ax, + ) + + return ax + + @_deprecate_positional_args def rugplot( x=None, # Allow positional x, because behavior won't change From 96744c27fe614d14139f02085c2d879bff333ad1 Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Mon, 15 Jun 2020 17:03:28 -0400 Subject: [PATCH 02/11] Allow user to override drawstyle --- seaborn/distributions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/seaborn/distributions.py b/seaborn/distributions.py index f56686fb21..c3a16c2c98 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -1137,6 +1137,9 @@ def plot_univariate_ecdf(self, estimate_kws, legend, plot_kws, ax): # TODO maybe have an option for joint start/end? 
estimator = ECDF(**estimate_kws) + # Allow other drawstyles (I'm not sure why you'd want them) + plot_kws.setdefault("drawstyle", "steps-post") + # Loop through the subsets, transform and plot the data for sub_vars, sub_data in self._semantic_subsets( "hue", reverse=True, from_comp_data=True, @@ -1162,7 +1165,7 @@ def plot_univariate_ecdf(self, estimate_kws, legend, plot_kws, ax): stat_variable = "x" # Draw the line for this subset - artist, = ax.plot(*plot_args, drawstyle="steps-post", **artist_kws) + artist, = ax.plot(*plot_args, **artist_kws) sticky_edges = getattr(artist.sticky_edges, stat_variable) sticky_edges[:] = 0, 1 From d2d3dca8917717b072b6a1f7734829f8307d0ce1 Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Mon, 15 Jun 2020 20:53:15 -0400 Subject: [PATCH 03/11] Add unit tests --- seaborn/_statistics.py | 6 +-- seaborn/distributions.py | 7 ++- seaborn/tests/test_distributions.py | 73 ++++++++++++++++++++++++- seaborn/tests/test_statistics.py | 84 +++++++++++++++++++++++++---- 4 files changed, 155 insertions(+), 15 deletions(-) diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py index 8ab07501cb..2e31831dec 100644 --- a/seaborn/_statistics.py +++ b/seaborn/_statistics.py @@ -357,8 +357,8 @@ def __init__(self, stat="proportion"): # Do we need bivariate ECDF? 
def _eval_univariate(self, x, weights): - - sorter = np.argsort(x) + """Inner function for ECDF of one variable.""" + sorter = x.argsort() x = x[sorter] weights = weights[sorter] @@ -373,7 +373,7 @@ def _eval_univariate(self, x, weights): return y, x def __call__(self, x1, weights=None): - + """Return proportion or count of observations below each sorted datapoint.""" x1 = np.asarray(x1) if weights is None: weights = np.ones_like(x1) diff --git a/seaborn/distributions.py b/seaborn/distributions.py index c3a16c2c98..141c360b33 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -1164,10 +1164,15 @@ def plot_univariate_ecdf(self, estimate_kws, legend, plot_kws, ax): plot_args = stat, vals stat_variable = "x" + if estimator.stat == "count": + top_edge = len(observations) + else: + top_edge = 1 + # Draw the line for this subset artist, = ax.plot(*plot_args, **artist_kws) sticky_edges = getattr(artist.sticky_edges, stat_variable) - sticky_edges[:] = 0, 1 + sticky_edges[:] = 0, top_edge # --- Finalize the plot ---- stat = estimator.stat.capitalize() diff --git a/seaborn/tests/test_distributions.py b/seaborn/tests/test_distributions.py index 2ea44faa23..60bebb42ed 100644 --- a/seaborn/tests/test_distributions.py +++ b/seaborn/tests/test_distributions.py @@ -26,8 +26,9 @@ from ..distributions import ( _DistributionPlotter, histplot, - rugplot, + ecdfplot, kdeplot, + rugplot, ) @@ -1793,3 +1794,73 @@ def test_colorbar(self, long_df): f, (ax, cax) = plt.subplots(2) histplot(long_df, x="x", y="y", cbar=True, cbar_ax=cax, ax=ax) assert len(ax.figure.axes) == 2 + + +class TestECDFPlotUnivariate: + + @pytest.mark.parametrize("variable", ["x", "y"]) + def test_long_vectors(self, long_df, variable): + + vector = long_df[variable] + vectors = [ + variable, vector, np.asarray(vector), vector.tolist(), + ] + + f, ax = plt.subplots() + for vector in vectors: + ecdfplot(data=long_df, ax=ax, **{variable: vector}) + + xdata = [l.get_xdata() for l in ax.lines] + for a, 
b in itertools.product(xdata, xdata): + assert_array_equal(a, b) + + ydata = [l.get_ydata() for l in ax.lines] + for a, b in itertools.product(ydata, ydata): + assert_array_equal(a, b) + + def test_hue(self, long_df): + + ax = ecdfplot(long_df, x="x", hue="a") + + for line, color in zip(ax.lines[::-1], color_palette()): + assert line.get_color() == color + + def test_line_kwargs(self, long_df): + + ls = "--" + lw = 3 + ax = ecdfplot(long_df, x="x", hue="a", ls=ls, lw=lw) + + for line in ax.lines: + assert line.get_linestyle() == ls + assert line.get_linewidth() == lw + + @pytest.mark.parametrize( + "data_var,stat_var", [["x", "y"], ["y", "x"]], + ) + def test_proportion_limits(self, flat_series, data_var, stat_var): + + ax = ecdfplot(**{data_var: flat_series}) + data = getattr(ax.lines[0], f"get_{stat_var}data")() + assert data[0] == 0 + assert data[-1] == 1 + sticky_edges = getattr(ax.lines[0].sticky_edges, stat_var) + assert sticky_edges[:] == [0, 1] + + @pytest.mark.parametrize( + "data_var,stat_var", [["x", "y"], ["y", "x"]], + ) + def test_proportion_count(self, flat_series, data_var, stat_var): + + n = len(flat_series) + ax = ecdfplot(**{data_var: flat_series}, stat="count") + data = getattr(ax.lines[0], f"get_{stat_var}data")() + assert data[0] == 0 + assert data[-1] == n + sticky_edges = getattr(ax.lines[0].sticky_edges, stat_var) + assert sticky_edges[:] == [0, n] + + def test_bivariate_error(self, long_df): + + with pytest.raises(NotImplementedError, match="Bivariate ECDF plots"): + ecdfplot(data=long_df, x="x", y="y") diff --git a/seaborn/tests/test_statistics.py b/seaborn/tests/test_statistics.py index 87a63169cf..4d246371ee 100644 --- a/seaborn/tests/test_statistics.py +++ b/seaborn/tests/test_statistics.py @@ -1,15 +1,36 @@ import numpy as np from scipy import integrate +try: + import statsmodels.distributions as smdist +except ImportError: + smdist = None + import pytest -from numpy.testing import assert_array_equal +from numpy.testing import 
assert_array_equal, assert_array_almost_equal from .._statistics import ( KDE, Histogram, + ECDF, ) +class DistributionFixtures: + + @pytest.fixture + def x(self, rng): + return rng.normal(0, 1, 100) + + @pytest.fixture + def y(self, rng): + return rng.normal(0, 5, 100) + + @pytest.fixture + def weights(self, rng): + return rng.uniform(0, 5, 100) + + class TestKDE: def test_gridsize(self, rng): @@ -127,15 +148,7 @@ def test_bivariate_cumulative(self, rng): assert density[-1, -1] == pytest.approx(1, abs=1e-2) -class TestHistogram: - - @pytest.fixture - def x(self, rng): - return rng.normal(0, 1, 100) - - @pytest.fixture - def y(self, rng): - return rng.normal(0, 5, 100) +class TestHistogram(DistributionFixtures): def test_string_bins(self, x): @@ -379,3 +392,54 @@ def test_bad_stat(self): with pytest.raises(ValueError): Histogram(stat="invalid") + + +class TestECDF(DistributionFixtures): + + def test_univariate_proportion(self, x): + + ecdf = ECDF() + stat, vals = ecdf(x) + assert_array_equal(vals[1:], np.sort(x)) + assert_array_almost_equal(stat[1:], np.linspace(0, 1, len(x) + 1)[1:]) + assert stat[0] == 0 + + def test_univariate_count(self, x): + + ecdf = ECDF(stat="count") + stat, vals = ecdf(x) + + assert_array_equal(vals[1:], np.sort(x)) + assert_array_almost_equal(stat[1:], np.arange(len(x)) + 1) + assert stat[0] == 0 + + def test_univariate_proportion_weights(self, x, weights): + + ecdf = ECDF() + stat, vals = ecdf(x, weights=weights) + assert_array_equal(vals[1:], np.sort(x)) + assert_array_almost_equal(stat[1:], weights[x.argsort()].cumsum() / weights.sum()) + assert stat[0] == 0 + + def test_univariate_count_weights(self, x, weights): + + ecdf = ECDF(stat="count") + stat, vals = ecdf(x, weights=weights) + assert_array_equal(vals[1:], np.sort(x)) + assert_array_almost_equal(stat[1:], weights[x.argsort()].cumsum()) + assert stat[0] == 0 + + @pytest.mark.skipif(smdist is None, reason="Requires statsmodels") + def test_against_statsmodels(self, x): + + ecdf = 
ECDF() + stat, vals = ecdf(x) + + sm_ecdf = smdist.empirical_distribution.ECDF(x) + assert_array_equal(vals, sm_ecdf.x) + assert_array_almost_equal(stat, sm_ecdf.y) + + def test_invalid_stat(self, x): + + with pytest.raises(ValueError, match="`stat` must be one of"): + ECDF(stat="density") \ No newline at end of file From a3250efdaa746476369b7a46a97b95e1679e19f3 Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Mon, 15 Jun 2020 21:14:33 -0400 Subject: [PATCH 04/11] Add docstring content --- doc/api.rst | 1 + seaborn/_docstrings.py | 3 ++ seaborn/distributions.py | 57 ++++++++++++++++++++++++++++- seaborn/tests/test_distributions.py | 4 +- seaborn/tests/test_statistics.py | 5 ++- 5 files changed, 65 insertions(+), 5 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 81cde3ea95..563559c556 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -45,6 +45,7 @@ Distribution plots distplot histplot + ecdfplot kdeplot rugplot diff --git a/seaborn/_docstrings.py b/seaborn/_docstrings.py index 010dcdb8e5..a00bb79b04 100644 --- a/seaborn/_docstrings.py +++ b/seaborn/_docstrings.py @@ -116,6 +116,9 @@ def from_function_params(cls, func): """, kdeplot=""" kdeplot : Plot univariate or bivariate distributions using kernel density estimation. + """, + ecdfplot=""" +ecdfplot : Plot empirical cumulative distribution functions. """, rugplot=""" rugplot : Plot a tick at each observation value along the x and/or y axes. 
diff --git a/seaborn/distributions.py b/seaborn/distributions.py index 141c360b33..68e80a30d9 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -1475,6 +1475,7 @@ def histplot( -------- {seealso.kdeplot} {seealso.rugplot} +{seealso.ecdfplot} {seealso.jointplot} distplot @@ -1781,9 +1782,10 @@ def kdeplot( See Also -------- +{seealso.violinplot} {seealso.histplot} +{seealso.ecdfplot} {seealso.rugplot} -{seealso.violinplot} {seealso.jointplot} distplot @@ -1839,7 +1841,7 @@ def ecdfplot( # Computation parameters stat="proportion", # Hue mapping parameters - palette=None, hue_order=None, hue_norm=None, color=None, + palette=None, hue_order=None, hue_norm=None, # Axes information log_scale=None, legend=True, ax=None, # Other appearance keywords @@ -1883,6 +1885,57 @@ def ecdfplot( return ax +ecdfplot.__doc__ = """\ +Plot empirical cumulative distribution functions. + +An ECDF represents the proportion or count of observations falling below each +unique value in a dataset. Compared to a histogram or density plot, it has the +advantage that each observation is visualized directly, meaning that there are +no binning or smoothing parameters that need to be adjusted. It also aids direct +comparisons between multiple distributions. A downside is that the relationship +between the appearance of the plot and the basic properties of the distribution +(such as its central tendency, variance, and the presence of any bimodality) +may not be as intuitive. + +More information is provided in the :ref:`user guide <userguide_ecdf>`. + +Parameters +---------- +{params.core.data} +{params.core.xy} +{params.core.hue} +weights : vector or key in ``data`` + If provided, weight the contribution of the corresponding data points + towards the distribution by these factors. +stat : {{"proportion", "count"}} + Distribution statistic to compute. 
+{params.core.palette} +{params.core.hue_order} +{params.core.hue_norm} +{params.dist.log_scale} +{params.dist.legend} +{params.core.ax} +kwargs + Other keyword arguments are passed to :meth:`matplotlib.axes.Axes.plot`. + +Returns +------- +{returns.ax} + +See Also +-------- +{seealso.histplot} +{seealso.kdeplot} +{seealso.rugplot} +distplot + +""".format( + params=_param_docs, + returns=_core_docs["returns"], + seealso=_core_docs["seealso"], +) + + @_deprecate_positional_args def rugplot( x=None, # Allow positional x, because behavior won't change diff --git a/seaborn/tests/test_distributions.py b/seaborn/tests/test_distributions.py index 60bebb42ed..bac5f30149 100644 --- a/seaborn/tests/test_distributions.py +++ b/seaborn/tests/test_distributions.py @@ -1827,11 +1827,13 @@ def test_hue(self, long_df): def test_line_kwargs(self, long_df): + color = "r" ls = "--" lw = 3 - ax = ecdfplot(long_df, x="x", hue="a", ls=ls, lw=lw) + ax = ecdfplot(long_df, x="x", color=color, ls=ls, lw=lw) for line in ax.lines: + assert line.get_color() == to_rgb(color) assert line.get_linestyle() == ls assert line.get_linewidth() == lw diff --git a/seaborn/tests/test_statistics.py b/seaborn/tests/test_statistics.py index 4d246371ee..e51f77dcc7 100644 --- a/seaborn/tests/test_statistics.py +++ b/seaborn/tests/test_statistics.py @@ -418,7 +418,8 @@ def test_univariate_proportion_weights(self, x, weights): ecdf = ECDF() stat, vals = ecdf(x, weights=weights) assert_array_equal(vals[1:], np.sort(x)) - assert_array_almost_equal(stat[1:], weights[x.argsort()].cumsum() / weights.sum()) + expected_stats = weights[x.argsort()].cumsum() / weights.sum() + assert_array_almost_equal(stat[1:], expected_stats) assert stat[0] == 0 def test_univariate_count_weights(self, x, weights): @@ -442,4 +443,4 @@ def test_against_statsmodels(self, x): def test_invalid_stat(self, x): with pytest.raises(ValueError, match="`stat` must be one of"): - ECDF(stat="density") \ No newline at end of file + 
ECDF(stat="density") From a211515b4b9b1e55391563824fe67b32389a06bf Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Mon, 15 Jun 2020 21:52:14 -0400 Subject: [PATCH 05/11] Add more docstring information and fix test --- seaborn/_statistics.py | 36 ++++++++++++++++++++++++++++- seaborn/distributions.py | 7 ++---- seaborn/tests/test_distributions.py | 2 +- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py index 2e31831dec..bccd088eec 100644 --- a/seaborn/_statistics.py +++ b/seaborn/_statistics.py @@ -1,3 +1,29 @@ +"""Statistical transformations for visualization. + +This module is currently private, but is being written to eventually form part +of the public API. + +The classes should behave roughly in the style of scikit-learn. + +- All data-independent parameters should be passed to the class constructor. +- Each class should implement a default transformation that is exposed through + __call__. These are currently written for vector arguments, but I think + consuming a whole `plot_data` DataFrame and return it with transformed + variables would make more sense. +- Some classes have data-dependent preprocessing that should be cached and used + multiple times (think defining histogram bins off all data and then counting + observations within each bin multiple times per data subsets). These currently + have unique names, but it would be good to have a common name. Not quite + `fit`, but something similar. +- Alternatively, the transform interface could take some information about grouping + variables and do a groupby internally. +- Some classes should define alternate transforms that might make the most sense + with a different function. For example, KDE usually evaluates the distribution + on a regular grid, but it would be useful for it to transform at the actual + datapoints. Then again, this could be controlled by a parameter at the time of + class instantiation. 
+ +""" from distutils.version import LooseVersion from numbers import Number import numpy as np @@ -348,9 +374,17 @@ def __call__(self, x1, x2=None, weights=None): class ECDF: - + """Univariate empirical cumulative distribution estimator.""" def __init__(self, stat="proportion"): + """Initialize the class with its paramters + Parameters + ---------- + stat : {{"proportion", "count"}} + Distribution statistic to compute. + + """ + # TODO add remove_duplicates _check_argument("stat", ["count", "proportion"], stat) self.stat = stat diff --git a/seaborn/distributions.py b/seaborn/distributions.py index 68e80a30d9..58f7615e5b 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -76,6 +76,7 @@ dist=DocstringComponents(_dist_params), kde=DocstringComponents.from_function_params(KDE.__init__), hist=DocstringComponents.from_function_params(Histogram.__init__), + ecdf=DocstringComponents.from_function_params(ECDF.__init__), ) @@ -1904,11 +1905,7 @@ def ecdfplot( {params.core.data} {params.core.xy} {params.core.hue} -weights : vector or key in ``data`` - If provided, weight the contribution of the corresponding data points - towards the distribution by these factors. -stat : {{"proportion", "count"}} - Distribution statistic to compute. 
+{params.ecdf.stat} {params.core.palette} {params.core.hue_order} {params.core.hue_norm} diff --git a/seaborn/tests/test_distributions.py b/seaborn/tests/test_distributions.py index bac5f30149..98f26d63d0 100644 --- a/seaborn/tests/test_distributions.py +++ b/seaborn/tests/test_distributions.py @@ -1833,7 +1833,7 @@ def test_line_kwargs(self, long_df): ax = ecdfplot(long_df, x="x", color=color, ls=ls, lw=lw) for line in ax.lines: - assert line.get_color() == to_rgb(color) + assert to_rgb(line.get_color()) == to_rgb(color) assert line.get_linestyle() == ls assert line.get_linewidth() == lw From eef3f4b562808b37935133ff464ff231fd31fafa Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Tue, 16 Jun 2020 09:25:48 -0400 Subject: [PATCH 06/11] Add complementary ECDF --- seaborn/_statistics.py | 10 +++++++--- seaborn/distributions.py | 4 +++- seaborn/tests/test_distributions.py | 12 ++++++++++++ seaborn/tests/test_statistics.py | 9 +++++++-- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py index bccd088eec..f0c8f73a37 100644 --- a/seaborn/_statistics.py +++ b/seaborn/_statistics.py @@ -375,18 +375,20 @@ def __call__(self, x1, x2=None, weights=None): class ECDF: """Univariate empirical cumulative distribution estimator.""" - def __init__(self, stat="proportion"): + def __init__(self, stat="proportion", complementary=False): """Initialize the class with its paramters Parameters ---------- stat : {{"proportion", "count"}} Distribution statistic to compute. + complementary : bool + If True, use the complementary CDF (1 - CDF) """ - # TODO add remove_duplicates _check_argument("stat", ["count", "proportion"], stat) self.stat = stat + self.complementary = complementary # Do we need bivariate ECDF? 
@@ -395,7 +397,6 @@ def _eval_univariate(self, x, weights): sorter = x.argsort() x = x[sorter] weights = weights[sorter] - y = weights.cumsum() if self.stat == "proportion": @@ -404,6 +405,9 @@ def _eval_univariate(self, x, weights): x = np.r_[-np.inf, x] y = np.r_[0, y] + if self.complementary: + y = y.max() - y + return y, x def __call__(self, x1, weights=None): diff --git a/seaborn/distributions.py b/seaborn/distributions.py index 58f7615e5b..4451d83ceb 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -1840,7 +1840,7 @@ def ecdfplot( # Vector variables x=None, y=None, hue=None, weights=None, # Computation parameters - stat="proportion", + stat="proportion", complementary=False, # Hue mapping parameters palette=None, hue_order=None, hue_norm=None, # Axes information @@ -1874,6 +1874,7 @@ def ecdfplot( estimate_kws = dict( stat=stat, + complementary=complementary, ) p.plot_univariate_ecdf( @@ -1906,6 +1907,7 @@ def ecdfplot( {params.core.xy} {params.core.hue} {params.ecdf.stat} +{params.ecdf.complementary} {params.core.palette} {params.core.hue_order} {params.core.hue_norm} diff --git a/seaborn/tests/test_distributions.py b/seaborn/tests/test_distributions.py index 98f26d63d0..fec93dc1d0 100644 --- a/seaborn/tests/test_distributions.py +++ b/seaborn/tests/test_distributions.py @@ -1849,6 +1849,18 @@ def test_proportion_limits(self, flat_series, data_var, stat_var): sticky_edges = getattr(ax.lines[0].sticky_edges, stat_var) assert sticky_edges[:] == [0, 1] + @pytest.mark.parametrize( + "data_var,stat_var", [["x", "y"], ["y", "x"]], + ) + def test_proportion_limits_complementary(self, flat_series, data_var, stat_var): + + ax = ecdfplot(**{data_var: flat_series}, complementary=True) + data = getattr(ax.lines[0], f"get_{stat_var}data")() + assert data[0] == 1 + assert data[-1] == 0 + sticky_edges = getattr(ax.lines[0].sticky_edges, stat_var) + assert sticky_edges[:] == [0, 1] + @pytest.mark.parametrize( "data_var,stat_var", [["x", "y"], ["y", 
"x"]], ) diff --git a/seaborn/tests/test_statistics.py b/seaborn/tests/test_statistics.py index e51f77dcc7..b6661ce7b3 100644 --- a/seaborn/tests/test_statistics.py +++ b/seaborn/tests/test_statistics.py @@ -433,13 +433,18 @@ def test_univariate_count_weights(self, x, weights): @pytest.mark.skipif(smdist is None, reason="Requires statsmodels") def test_against_statsmodels(self, x): + sm_ecdf = smdist.empirical_distribution.ECDF(x) + ecdf = ECDF() stat, vals = ecdf(x) - - sm_ecdf = smdist.empirical_distribution.ECDF(x) assert_array_equal(vals, sm_ecdf.x) assert_array_almost_equal(stat, sm_ecdf.y) + ecdf = ECDF(complementary=True) + stat, vals = ecdf(x) + assert_array_equal(vals, sm_ecdf.x) + assert_array_almost_equal(stat, sm_ecdf.y[::-1]) + def test_invalid_stat(self, x): with pytest.raises(ValueError, match="`stat` must be one of"): From aca54c8a7f4aa0f8ac3fdcbea157e286ea0dec53 Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Tue, 16 Jun 2020 10:01:24 -0400 Subject: [PATCH 07/11] Add ecdfplot API examples --- doc/docstrings/ecdfplot.ipynb | 130 ++++++++++++++++++++++++++++++++++ doc/docstrings/histplot.ipynb | 2 +- seaborn/distributions.py | 10 ++- 3 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 doc/docstrings/ecdfplot.ipynb diff --git a/doc/docstrings/ecdfplot.ipynb b/doc/docstrings/ecdfplot.ipynb new file mode 100644 index 0000000000..ef41c7c8c9 --- /dev/null +++ b/doc/docstrings/ecdfplot.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot a univariate distribution along the x axis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns; sns.set()\n", + "penguins = sns.load_dataset(\"penguins\")\n", + "sns.ecdfplot(data=penguins, x=\"flipper_length_mm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Flip the plot by assigning the data variable to the 
y axis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins, y=\"flipper_length_mm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If neither `x` nor `y` is assigned, the dataset is treated as wide-form, and an ECDF is drawn for each numeric column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins.filter(like=\"culmen_\", axis=\"columns\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also draw multiple ECDFs from a long-form dataset with hue mapping:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins, x=\"culmen_length_mm\", hue=\"species\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default distribution statistic is normalized to show a proportion, but you can show absolute counts instead:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins, x=\"culmen_length_mm\", hue=\"species\", stat=\"count\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's also possible to plot the empirical complementary CDF (1 - CDF):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.ecdfplot(data=penguins, x=\"culmen_length_mm\", hue=\"species\", complementary=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "seaborn-refactor (py38)", + "language": "python", + "name": "seaborn-refactor" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/doc/docstrings/histplot.ipynb b/doc/docstrings/histplot.ipynb index 2efde821ef..8e8fc5f7e1 100644 --- a/doc/docstrings/histplot.ipynb +++ b/doc/docstrings/histplot.ipynb @@ -103,7 +103,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can also draw multiple histograms from a long-form dataset with hue mapping:" + "You can otherwise draw multiple histograms from a long-form dataset with hue mapping:" ] }, { diff --git a/seaborn/distributions.py b/seaborn/distributions.py index 4451d83ceb..4064921867 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -1146,8 +1146,11 @@ def plot_univariate_ecdf(self, estimate_kws, legend, plot_kws, ax): "hue", reverse=True, from_comp_data=True, ): - # "Compute" the ECDF + # Compute the ECDF sub_data = sub_data[cols].dropna() + if sub_data.empty: + continue + observations = sub_data[self.data_variable] weights = sub_data.get("weights", None) stat, vals = estimator(observations, weights) @@ -1928,6 +1931,11 @@ def ecdfplot( {seealso.rugplot} distplot +Examples +-------- + +.. 
include:: ../docstrings/ecdfplot.rst + """.format( params=_param_docs, returns=_core_docs["returns"], From 5b4f3d27bf43860b9e3fd3d04c2b794aaaca1be3 Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Tue, 16 Jun 2020 10:38:00 -0400 Subject: [PATCH 08/11] Fix step plots with y data variable --- seaborn/distributions.py | 13 +++++++++---- seaborn/tests/test_distributions.py | 7 +++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/seaborn/distributions.py b/seaborn/distributions.py index 4064921867..05a9d1b883 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -540,8 +540,12 @@ def plot_univariate_histogram( y = np.append(hist["heights"], final["heights"]) b = np.append(bottom, bottom[-1]) - step = "post" - drawstyle = "steps-post" + if self.data_variable == "x": + step = "post" + drawstyle = "steps-post" + else: + step = "post" # fillbetweenx handles mapping internally + drawstyle = "steps-pre" elif element == "poly": @@ -1138,8 +1142,9 @@ def plot_univariate_ecdf(self, estimate_kws, legend, plot_kws, ax): # TODO maybe have an option for joint start/end? 
estimator = ECDF(**estimate_kws) - # Allow other drawstyles (I'm not sure why you'd want them) - plot_kws.setdefault("drawstyle", "steps-post") + # Set the draw style to step the right way for the data variable + drawstyles = dict(x="steps-post", y="steps-pre") + plot_kws["drawstyle"] = drawstyles[self.data_variable] # Loop through the subsets, transform and plot the data for sub_vars, sub_data in self._semantic_subsets( diff --git a/seaborn/tests/test_distributions.py b/seaborn/tests/test_distributions.py index fec93dc1d0..2593eca382 100644 --- a/seaborn/tests/test_distributions.py +++ b/seaborn/tests/test_distributions.py @@ -1837,6 +1837,13 @@ def test_line_kwargs(self, long_df): assert line.get_linestyle() == ls assert line.get_linewidth() == lw + @pytest.mark.parametrize("data_var", ["x", "y"]) + def test_drawstyle(self, flat_series, data_var): + + ax = ecdfplot(**{data_var: flat_series}) + drawstyles = dict(x="steps-post", y="steps-pre") + assert ax.lines[0].get_drawstyle() == drawstyles[data_var] + @pytest.mark.parametrize( "data_var,stat_var", [["x", "y"], ["y", "x"]], ) From 69076caac6e21c0635a845b42c78041818cdfae2 Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Tue, 16 Jun 2020 10:57:16 -0400 Subject: [PATCH 09/11] Housekeeping --- seaborn/_statistics.py | 11 ++++++++--- seaborn/distributions.py | 1 - seaborn/tests/test_statistics.py | 6 ++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py index f0c8f73a37..9f6b8dff88 100644 --- a/seaborn/_statistics.py +++ b/seaborn/_statistics.py @@ -390,7 +390,9 @@ def __init__(self, stat="proportion", complementary=False): self.stat = stat self.complementary = complementary - # Do we need bivariate ECDF?
+ def _eval_bivariate(self, x1, x2, weights): + """Inner function for ECDF of two variables.""" + raise NotImplementedError def _eval_univariate(self, x, weights): """Inner function for ECDF of one variable.""" @@ -410,7 +412,7 @@ def _eval_univariate(self, x, weights): return y, x - def __call__(self, x1, weights=None): + def __call__(self, x1, x2=None, weights=None): """Return proportion or count of observations below each sorted datapoint.""" x1 = np.asarray(x1) if weights is None: @@ -418,4 +420,7 @@ def __call__(self, x1, weights=None): else: weights = np.asarray(weights) - return self._eval_univariate(x1, weights) + if x2 is None: + return self._eval_univariate(x1, weights) + else: + return self._eval_bivariate(x1, x2, weights) diff --git a/seaborn/distributions.py b/seaborn/distributions.py index 05a9d1b883..54d06a9afe 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -1139,7 +1139,6 @@ def plot_univariate_ecdf(self, estimate_kws, legend, plot_kws, ax): # TODO see notes elsewhere about GH2135 cols = list(self.variables) - # TODO maybe have an option for joint start/end?
estimator = ECDF(**estimate_kws) # Set the draw style to step the right way for the data variable diff --git a/seaborn/tests/test_statistics.py b/seaborn/tests/test_statistics.py index b6661ce7b3..4590849e3a 100644 --- a/seaborn/tests/test_statistics.py +++ b/seaborn/tests/test_statistics.py @@ -449,3 +449,9 @@ def test_invalid_stat(self, x): with pytest.raises(ValueError, match="`stat` must be one of"): ECDF(stat="density") + + def test_bivariate_error(self, x, y): + + with pytest.raises(NotImplementedError, match="Bivariate ECDF"): + ecdf = ECDF() + ecdf(x, y) From 394669bcb560281467598e4c6b73ac457281bd8a Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Tue, 16 Jun 2020 11:16:12 -0400 Subject: [PATCH 10/11] Fix error message --- seaborn/_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py index 9f6b8dff88..40d5801364 100644 --- a/seaborn/_statistics.py +++ b/seaborn/_statistics.py @@ -392,7 +392,7 @@ def __init__(self, stat="proportion", complementary=False): def _eval_bivariate(self, x1, x2, weights): """Inner function for ECDF of two variables.""" - raise NotImplementedError + raise NotImplementedError("Bivariate ECDF is not implemented") def _eval_univariate(self, x, weights): """Inner function for ECDF of one variable.""" From f8fc9152042427af70daba79c84f86d73ad98350 Mon Sep 17 00:00:00 2001 From: Michael Waskom <mwaskom@nyu.edu> Date: Tue, 16 Jun 2020 20:51:05 -0400 Subject: [PATCH 11/11] Mention ecdfplot in release notes --- doc/releases/v0.11.0.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/releases/v0.11.0.txt b/doc/releases/v0.11.0.txt index 53c6690c5c..d6e24b693a 100644 --- a/doc/releases/v0.11.0.txt +++ b/doc/releases/v0.11.0.txt @@ -9,7 +9,9 @@ v0.11.0 (Unreleased) Modernization of distribution functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -First, a new function, :func:`histplot` has been added.
:func:`histplot` draws univariate or bivariate histograms with a number of features, including: +First, two new functions, :func:`histplot` and :func:`ecdfplot` have been added. + +:func:`histplot` draws univariate or bivariate histograms with a number of features, including: - mapping multiple distributions with a ``hue`` semantic - normalization to show density, probability, or frequency statistics @@ -17,6 +19,8 @@ First, a new function, :func:`histplot` has been added. :func:`histplot` draws u - adding a KDE fit to show a smoothed distribution over all bin statistics - experimental support for histograms over categorical and datetime variables. GH2125 +:func:`ecdfplot` draws univariate empirical cumulative distribution functions, using a similar interface. + Second, the existing functions :func:`kdeplot` and :func:`rugplot` have been completely overhauled. Two of the oldest functions in the library, these lacked aspects of the otherwise-common seaborn API, such as the ability to assign variables by name from a ``data`` object; they had no capacity for semantic mapping; and they had numerous other inconsistencies and smaller issues. The overhauled functions now share a common API with the rest of seaborn, they can show conditional distributions by mapping a third variable with a ``hue`` semantic, and have been improved in numerous other ways. The `github pull request (GH2104) <https://github.com/mwaskom/seaborn/pull/2104>`_ has a longer explanation of the changes and the motivation behind them.