First steps towards implementing DataSource, with issues

mwaskom · May 19, 2021 · 4ca5964 · 4ca5964
1 parent ecd5773
commit 4ca5964
Showing 1 changed file with 208 additions and 12 deletions.
diff --git a/seaborn/_new_core.py b/seaborn/_new_core.py
@@ -9,6 +9,7 @@
 from pandas import DataFrame, Series
 
 
+# TODO ndarray can be the numpy ArrayLike on 1.20+ (?)
 Vector = Union[Series, ndarray, Sequence, Number]
 
 # TODO Should we define a DataFrame-like type that is Union[DataFrame, Mapping]?
@@ -47,15 +48,16 @@ def add(
 
         return self
 
-    def plot(self) -> Plot:
+    # TODO problem with "draw" meaning something specific in mpl?
+    def draw(self) -> Plot:
 
         # TODO or something like this
         for layer in self.layers:
-            self._plot_layer(layer)
+            self._draw_layer(layer)
 
         return self
 
-    def _plot_layer(self, layer):
+    def _draw_layer(self, layer):
 
         # Roughly ...
 
@@ -71,15 +73,15 @@ def _plot_layer(self, layer):
         data = self.invert_scale(data)
 
         # Something like this?
-        layer.mark.plot(data)  # do we pass ax (and/or facets?! here?)
+        layer.mark._draw(data)  # do we pass ax (and/or facets?! here?)
 
     def show(self) -> Plot:
 
         # TODO guard this here?
         # We could have the option to be totally pyplot free
         # in which case this method would raise
         import matplotlib.pyplot as plt
-        self.plot()
+        self.draw()
         plt.show()
 
         return self
@@ -122,13 +124,166 @@ class DataSource:
 
     # Who owns the existing VectorPlotter.variables, VectorPlotter.var_levels, etc.?
 
-    def __init__(self, data, variables):
+    def __init__(
+        self,
+        data: Union[DataFrame, Mapping, None],
+        variables: dict[str, Union[str, Vector]],
+        # TODO pass in wide semantics?
+    ):
+
+        # TODO only specing out with long-form data for now...
+        data, names = self._assign_variables_longform(data, variables)
+
+        self._data = data
+        self._names = names
+
+    def join(self, other: DataSource) -> None:  # TODO return self?
 
-        pass
+        # TODO Define this as a left join I guess
 
-    def join(self, other: DataSource):
+        # TODO If the idea is that the layer-specific data source will be on the
+        # left, then we can essentially do an update with other's plot_data and
+        # var names. But that is tricky because each layer will rarely (probably)
+        # define x/y, meaning that naively we would think we have wide-form data
 
-        pass
+        # TODO also we need to account for when we just get names at the layer
+        # and they reference fields in the original data. Maybe the signature
+        # here should chance to data, variables? (But that would reverse the
+        # assumed order of what is self and what is other)
+
+        # TODO also, we want to be able to disable global semantics by passing None
+        # here, which probably needs special handling as currently we'll end up
+        # with a column of Nones which is otherwise gonna get us in trouble...
+
+        data = (
+            self._data
+            .drop(other._data.columns, axis=1)
+            .join(other._data, how="left", copy=False)  # TODO check copy
+        )
+
+        names = self._names.copy()
+        names.update(other._names)
+
+        self._data = data
+        self._names = names
+
+    def _assign_variables_longform(
+        self,
+        data: Union[DataFrame, Mapping, None],
+        variables: dict[str, Union[str, Vector]]
+    ) -> tuple[DataFrame, dict[str, Union[str, None]]]:
+        """Define plot variables given long-form data and/or vector inputs.
+
+        Parameters
+        ----------
+        data
+            Input data where variable names map to vector values.
+        variables
+            Keys are seaborn variables (x, y, hue, ...) and values are vectors
+            in any format that can construct a :class:`pandas.DataFrame` or
+            names of columns or index levels in ``data``.
+
+        Returns
+        -------
+        plot_data : :class:`pandas.DataFrame`
+            Long-form data object mapping seaborn variables (x, y, hue, ...)
+            to data vectors.
+        variables : dict
+            Keys are defined seaborn variables; values are names inferred from
+            the inputs (or None when no name can be determined).
+
+        Raises
+        ------
+        ValueError
+            When variables are strings that don't appear in ``data``.
+
+        """
+        plot_data = {}
+        var_names = {}
+
+        # Data is optional; all variables can be defined as vectors
+        if data is None:
+            data = {}
+
+        # TODO should we try a data.to_dict() or similar here to more
+        # generally accept objects with that interface?
+        # Note that dict(df) also works for pandas, and gives us what we
+        # want, whereas DataFrame.to_dict() gives a nested dict instead of
+        # a dict of series.
+
+        # Variables can also be extraced from the index attribute
+        # TODO is this the most general way to enable it?
+        # There is no index.to_dict on multiindex, unfortunately
+        try:
+            index = data.index.to_frame()
+        except AttributeError:
+            index = {}
+
+        # The caller will determine the order of variables in plot_data
+        for key, val in variables.items():
+
+            # First try to treat the argument as a key for the data collection.
+            # But be flexible about what can be used as a key.
+            # Usually it will be a string, but allow numbers or tuples too when
+            # taking from the main data object. Only allow strings to reference
+            # fields in the index, because otherwise there is too much ambiguity.
+            try:
+                val_as_data_key = (
+                    val in data
+                    or (isinstance(val, (str, bytes)) and val in index)
+                )
+            except (KeyError, TypeError):
+                val_as_data_key = False
+
+            if val_as_data_key:
+
+                # We know that __getitem__ will work
+
+                if val in data:
+                    plot_data[key] = data[val]
+                elif val in index:
+                    plot_data[key] = index[val]
+                var_names[key] = val
+
+            elif isinstance(val, (str, bytes)):
+
+                # This looks like a column name but we don't know what it means!
+
+                err = f"Could not interpret value `{val}` for parameter `{key}`"
+                raise ValueError(err)
+
+            else:
+
+                # Otherwise, assume the value is itself data
+
+                # Raise when data object is present and a vector can't matched
+                if isinstance(data, pd.DataFrame) and not isinstance(val, pd.Series):
+                    if np.ndim(val) and len(data) != len(val):
+                        val_cls = val.__class__.__name__
+                        err = (
+                            f"Length of {val_cls} vectors must match length of `data`"
+                            f" when both are used, but `data` has length {len(data)}"
+                            f" and the vector passed to `{key}` has length {len(val)}."
+                        )
+                        raise ValueError(err)
+
+                plot_data[key] = val
+
+                # Try to infer the name of the variable
+                var_names[key] = getattr(val, "name", None)
+
+        # Construct a tidy plot DataFrame. This will convert a number of
+        # types automatically, aligning on index in case of pandas objects
+        plot_data = pd.DataFrame(plot_data)
+
+        # Reduce the variables dictionary to fields with valid data
+        var_names = {
+            var: name
+            for var, name in var_names.items()
+            if plot_data[var].notnull().any()
+        }
+
+        return plot_data, var_names
 
 
 class Stat:
@@ -148,11 +303,52 @@ class Mark:
 
 class Point(Mark):
 
-    def _plot(self, data_gen):  # TODO data_gen is maybe too restrictive a name?
+    def __init__(self, **kwargs):
+
+        self.kwargs = kwargs
+
+    def _draw(self, plot):  # TODO data_gen is maybe too restrictive a name?
+
+        kws = self.kwargs.copy()
+
+        for keys, data, ax in plot.data_gen(self.group_vars):
+
+            # Define the vectors of x and y positions
+            empty = np.full(len(data), np.nan)
+            x = data.get("x", empty)
+            y = data.get("y", empty)
+
+            # Set defaults for other visual attributes
+            kws.setdefault("edgecolor", "w")
+
+            if "style" in data:
+                # Use a representative marker so scatter sets the edgecolor
+                # properly for line art markers. We currently enforce either
+                # all or none line art so this works.
+                example_level = self._style_map.levels[0]
+                example_marker = self._style_map(example_level, "marker")
+                kws.setdefault("marker", example_marker)
+
+            # TODO this makes it impossible to vary alpha with hue which might
+            # otherwise be useful? Should we just pass None?
+            kws["alpha"] = 1 if self.alpha == "auto" else self.alpha
+
+            # Draw the scatter plot
+            points = ax.scatter(x=x, y=y, **kws)
+
+            # Apply the mapping from semantic variables to artist attributes
+
+            if "hue" in self.variables:
+                points.set_facecolors(self._hue_map(data["hue"]))
+
+            if "size" in self.variables:
+                points.set_sizes(self._size_map(data["size"]))
 
-        for keys, data, ax in data_gen(self.group_vars):
+            if "style" in self.variables:
+                p = [self._style_map(val, "path") for val in data["style"]]
+                points.set_paths(p)
 
-            pass
+            # Apply dependant default attributes
 
 
 class Layer: