Standardize parameter names in distributions module (#2060)

* Standardize parameter names in distributions module * Compat with older statsmodels * Improve kdeplot tests
mwaskom · May 9, 2020 · dde9f5e · dde9f5e
1 parent 09b0ebb
commit dde9f5e
Show file tree

Hide file tree

Showing 3 changed files with 119 additions and 52 deletions.
diff --git a/doc/releases/v0.11.0.txt b/doc/releases/v0.11.0.txt
@@ -4,6 +4,8 @@ v0.11.0 (Unreleased)
 
 - Enforced keyword-only arguments for most parameters of most functions and classes.
 
+- Standardized the parameter names for the oldest functions (:func:`distplot`, :func:`kdeplot`, and :func:`rugplot`) to be `x` and `y`, as in other functions. Using the old names will warn now and break in the future.
+
 - Added an explicit warning in :func:`swarmplot` when more than 2% of the points are overlap in the "gutters" of the swarm.
 
 - Added the ``axes_dict`` attribute to :class:`FacetGrid` for named access to the component axes.

diff --git a/seaborn/distributions.py b/seaborn/distributions.py
@@ -38,12 +38,11 @@ def _freedman_diaconis_bins(a):
 
 @_deprecate_positional_args
 def distplot(
-    a,
-    *,
+    x=None, *,
     bins=None, hist=True, kde=True, rug=False, fit=None,
     hist_kws=None, kde_kws=None, rug_kws=None, fit_kws=None,
     color=None, vertical=False, norm_hist=False, axlabel=None,
-    label=None, ax=None
+    label=None, ax=None, a=None,
 ):
     """Flexibly plot a univariate distribution of observations.
 
@@ -55,7 +54,7 @@ def distplot(
     Parameters
     ----------
 
-    a : Series, 1d-array, or list.
+    x : Series, 1d-array, or list.
         Observed data. If this is a Series object with a ``name`` attribute,
         the name will be used to label the data axis.
     bins : argument for matplotlib hist(), or None, optional
@@ -169,6 +168,14 @@ def distplot(
         ...                             "alpha": 1, "color": "g"})
 
     """
+    # Handle deprecation of ``a```
+    if a is not None:
+        msg = "The `a` parameter is now called `x`. Please update your code."
+        warnings.warn(msg)
+    else:
+        a = x  # TODO refactor
+
+    # Default to drawing on the currently-active axes
     if ax is None:
         ax = plt.gca()
 
@@ -363,6 +370,9 @@ def _univariate_kdeplot(data, shade, vertical, kernel, bw, gridsize, cut,
 def _statsmodels_univariate_kde(data, kernel, bw, gridsize, cut, clip,
                                 cumulative=False):
     """Compute a univariate kernel density estimate using statsmodels."""
+    # statsmodels 0.8 fails on int type data
+    data = data.astype(np.float64)
+
     fft = kernel == "gau"
     kde = smnp.KDEUnivariate(data)
 
@@ -464,6 +474,10 @@ def _bivariate_kdeplot(x, y, filled, fill_lowest,
 
 def _statsmodels_bivariate_kde(x, y, bw, gridsize, cut, clip):
     """Compute a bivariate kde using statsmodels."""
+    # statsmodels 0.8 fails on int type data
+    x = x.astype(np.float64)
+    y = y.astype(np.float64)
+
     if isinstance(bw, str):
         bw_func = getattr(smnp.bandwidths, "bw_" + bw)
         x_bw = bw_func(x)
@@ -509,21 +523,21 @@ def _scipy_bivariate_kde(x, y, bw, gridsize, cut, clip):
 
 @_deprecate_positional_args
 def kdeplot(
-    data, data2=None,
-    *,
+    x=None, y=None, *,
     shade=False, vertical=False, kernel="gau",
     bw="scott", gridsize=100, cut=3, clip=None, legend=True,
     cumulative=False, shade_lowest=True, cbar=False, cbar_ax=None,
     cbar_kws=None, ax=None,
+    data=None, data2=None,  # TODO move data once * is enforced
     **kwargs,
 ):
     """Fit and plot a univariate or bivariate kernel density estimate.
 
     Parameters
     ----------
-    data : 1d array-like
+    x : 1d array-like
         Input data.
-    data2: 1d array-like, optional
+    y: 1d array-like, optional
         Second input data. If present, a bivariate KDE will be estimated.
     shade : bool, optional
         If True, shade in the area under the KDE curve (or draw with filled
@@ -664,52 +678,44 @@ def kdeplot(
         ...                  cmap="Blues", shade=True, shade_lowest=False)
 
     """
-    if ax is None:
-        ax = plt.gca()
-
-    if isinstance(data, list):
-        data = np.asarray(data)
-
-    if len(data) == 0:
-        return ax
+    # Handle deprecation of `data` as name for x variable
+    # TODO this can be removed once refactored to do centralized preprocessing
+    # of input variables, because a vector input to `data` will be treated like
+    # an input to `x`. Warning is probably not necessary.
+    x_passed_as_data = (
+        x is None
+        and data is not None
+        and np.ndim(data) == 1
+    )
+    if x_passed_as_data:
+        x = data
 
-    data = data.astype(np.float64)
+    # Handle deprecation of `data2` as name for y variable
     if data2 is not None:
-        if isinstance(data2, list):
-            data2 = np.asarray(data2)
-        data2 = data2.astype(np.float64)
-
-    warn = False
-    bivariate = False
-    if isinstance(data, np.ndarray) and np.ndim(data) > 1:
-        warn = True
-        bivariate = True
-        x, y = data.T
-    elif isinstance(data, pd.DataFrame) and np.ndim(data) > 1:
-        warn = True
-        bivariate = True
-        x = data.iloc[:, 0].values
-        y = data.iloc[:, 1].values
-    elif data2 is not None:
-        bivariate = True
-        x = data
+        msg = "The `data2` param is now named `y`; please update your code."
+        warnings.warn(msg)
         y = data2
 
-    if warn:
-        warn_msg = ("Passing a 2D dataset for a bivariate plot is deprecated "
-                    "in favor of kdeplot(x, y), and it will cause an error in "
-                    "future versions. Please update your code.")
-        warnings.warn(warn_msg, UserWarning)
+    # TODO replace this preprocessing with central refactoring
+    if isinstance(x, list):
+        x = np.asarray(x)
+    if isinstance(y, list):
+        y = np.asarray(y)
 
+    bivariate = x is not None and y is not None
     if bivariate and cumulative:
         raise TypeError("Cumulative distribution plots are not"
                         "supported for bivariate distributions.")
+
+    if ax is None:
+        ax = plt.gca()
+
     if bivariate:
         ax = _bivariate_kdeplot(x, y, shade, shade_lowest,
                                 kernel, bw, gridsize, cut, clip, legend,
                                 cbar, cbar_ax, cbar_kws, ax, **kwargs)
     else:
-        ax = _univariate_kdeplot(data, shade, vertical, kernel, bw,
+        ax = _univariate_kdeplot(x, shade, vertical, kernel, bw,
                                  gridsize, cut, clip, legend, ax,
                                  cumulative=cumulative, **kwargs)
 
@@ -718,16 +724,16 @@ def kdeplot(
 
 @_deprecate_positional_args
 def rugplot(
-    a,
-    *,
+    x=None, *,
     height=.05, axis="x", ax=None,
+    a=None,
     **kwargs
 ):
     """Plot datapoints in an array as sticks on an axis.
 
     Parameters
     ----------
-    a : vector
+    x : vector
         1D array of observations.
     height : scalar, optional
         Height of ticks as proportion of the axis.
@@ -744,6 +750,14 @@ def rugplot(
         The Axes object with the plot on it.
 
     """
+    # Handle deprecation of ``a```
+    if a is not None:
+        msg = "The `a` parameter is now called `x`. Please update your code."
+        warnings.warn(msg)
+    else:
+        a = x  # TODO refactor
+
+    # Default to drawing on the currently active axes
     if ax is None:
         ax = plt.gca()
     a = np.asarray(a)

diff --git a/seaborn/tests/test_distributions.py b/seaborn/tests/test_distributions.py
@@ -1,3 +1,4 @@
+import itertools
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -6,16 +7,13 @@
 import pytest
 import nose.tools as nt
 import numpy.testing as npt
-from distutils.version import LooseVersion
 
 from .. import distributions as dist
 
 _no_statsmodels = not dist._has_statsmodels
 
 if not _no_statsmodels:
-    import statsmodels
     import statsmodels.nonparametric as smnp
-    _old_statsmodels = LooseVersion(statsmodels.__version__) < "0.11"
 else:
     _old_statsmodels = False
 
@@ -88,6 +86,13 @@ def test_distplot_with_nans(self):
             assert bar1.get_xy() == bar2.get_xy()
             assert bar1.get_height() == bar2.get_height()
 
+    def test_a_parameter_deprecation(self):
+
+        n = 10
+        with pytest.warns(UserWarning):
+            ax = dist.distplot(a=self.x, bins=n)
+        assert len(ax.patches) == n
+
 
 class TestKDE(object):
 
@@ -100,6 +105,38 @@ class TestKDE(object):
     clip = (-np.inf, np.inf)
     cut = 3
 
+    def test_kde_1d_input_output(self):
+        """Test that array/series/list inputs give the same output."""
+        f, ax = plt.subplots()
+
+        dist.kdeplot(x=self.x)
+        dist.kdeplot(x=pd.Series(self.x))
+        dist.kdeplot(x=self.x.tolist())
+        dist.kdeplot(data=self.x)
+
+        supports = [l.get_xdata() for l in ax.lines]
+        for a, b in itertools.product(supports, supports):
+            assert np.array_equal(a, b)
+
+        densities = [l.get_ydata() for l in ax.lines]
+        for a, b in itertools.product(densities, densities):
+            assert np.array_equal(a, b)
+
+    def test_kde_2d_input_output(self):
+        """Test that array/series/list inputs give the same output."""
+        f, ax = plt.subplots()
+
+        dist.kdeplot(x=self.x, y=self.y)
+        dist.kdeplot(x=pd.Series(self.x), y=pd.Series(self.y))
+        dist.kdeplot(x=self.x.tolist(), y=self.y.tolist())
+
+        contours = ax.collections
+        n = len(contours) // 3
+
+        for i in range(n):
+            for a, b in itertools.product(contours[i::n], contours[i::n]):
+                assert np.array_equal(a.get_offsets(), b.get_offsets())
+
     def test_scipy_univariate_kde(self):
         """Test the univariate KDE estimation with scipy."""
         grid, y = dist._scipy_univariate_kde(self.x, self.bw, self.gridsize,
@@ -171,7 +208,7 @@ def test_kde_cummulative_2d(self):
             dist.kdeplot(self.x, self.y, cumulative=True)
 
     def test_kde_singular(self):
-
+        """Check that kdeplot warns and skips on singular inputs."""
         with pytest.warns(UserWarning):
             ax = dist.kdeplot(np.ones(10))
         line = ax.lines[0]
@@ -182,8 +219,13 @@ def test_kde_singular(self):
         line = ax.lines[1]
         assert not line.get_xydata().size
 
-    @pytest.mark.skipif(_no_statsmodels or _old_statsmodels,
-                        reason="no statsmodels or statsmodels without issue")
+    def test_data2_input_deprecation(self):
+        """Using data2 kwarg should warn but still draw a bivariate plot."""
+        with pytest.warns(UserWarning):
+            ax = dist.kdeplot(self.x, data2=self.y)
+        assert len(ax.collections)
+
+    @pytest.mark.skipif(_no_statsmodels, reason="no statsmodels")
     def test_statsmodels_zero_bandwidth(self):
         """Test handling of 0 bandwidth data in statsmodels."""
         x = np.zeros(100)
@@ -196,8 +238,9 @@ def test_statsmodels_zero_bandwidth(self):
         except RuntimeError:
 
             # Only execute the actual test in the except clause, this should
-            # keep the test from failing in the future if statsmodels changes
-            # it's behavior to avoid raising the error itself.
+            # allot the test to pass on versions of statsmodels predating 0.11
+            # and keep the test from failing in the future if statsmodels
+            # reverts its behavior to avoid raising the error in the futures
             # Track at https://github.com/statsmodels/statsmodels/issues/5419
 
             with pytest.warns(UserWarning):
@@ -339,3 +382,11 @@ def test_rugplot(self, list_data, array_data, series_data):
             assert np.squeeze(c.get_linewidth()).item() == lw
         assert c.get_alpha() == .5
         plt.close(f)
+
+    def test_a_parameter_deprecation(self, series_data):
+
+        with pytest.warns(UserWarning):
+            ax = dist.rugplot(a=series_data)
+        rug, = ax.collections
+        segments = np.array(rug.get_segments())
+        assert len(segments) == len(series_data)