Add prototype for unit conversion and scaling
mwaskom committed Jun 3, 2021
1 parent 7e06479 commit 28f9297
Showing 1 changed file with 113 additions and 21 deletions.
134 changes: 113 additions & 21 deletions seaborn/_new_core.py
@@ -119,6 +119,11 @@ def facet(
# TODO do we want to allow this method to be optional and create
# facets if col or row are defined in Plot()? More convenient...

# TODO another option would be to have this signature be like
# facet(dim, order, wrap, share)
# and expect to call it twice for column and row faceting
# (or have facet_col, facet_row)?

# TODO what should this data structure be?
# We can't initialize a FacetGrid here because that will open a figure
orders = {"col": col_order, "row": row_order}
@@ -211,13 +216,16 @@ def _setup_figure(self):
# TODO add external API for parameterizing figure, etc.
# TODO add external API for parameterizing FacetGrid if using
# TODO add external API for passing existing ax (maybe in same method)
# TODO add object that handles the "FacetGrid or single Axes?" abstractions

if not hasattr(self, "_facetspec"):
self.facet() # TODO a good way to activate defaults?

# TODO use context manager with theme that has been set
# TODO (or maybe wrap THIS function with context manager; would be cleaner)

if self._facetspec:

facet_data = pd.DataFrame()
facet_vars = {}
for dim in ["row", "col"]:
@@ -229,13 +237,25 @@ def _setup_figure(self):
facet_vars["col_wrap"] = self._facetspec[dim]["wrap"]
grid = FacetGrid(facet_data, **facet_vars, pyplot=False)
grid.set_titles()

if len(facet_vars) > 2:
zipped = zip(facet_data["row"], facet_data["col"])
facet_keys = pd.Series(zipped, index=facet_data.index)
else:
facet_keys = facet_data.squeeze().astype("category")
axes_map = facet_keys.map(grid.axes_dict)

self._figure = grid.fig
self._facets = grid
self._ax = None
self._facets = grid
self._axes_map = axes_map

else:

self._figure = Figure()
self._facets = None
self._ax = self._figure.add_subplot()
self._facets = None
self._axes_map = None

# TODO good place to do this? (needs to handle FacetGrid)
obj = self._ax if self._facets is None else self._facets
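(For orientation, a standalone sketch, not part of this commit, of what the new axes_map gives us: a Series aligned with the data that records which Axes each row should be drawn on, so downstream steps can route subsets with a single groupby. The axes_dict below stands in for FacetGrid.axes_dict.)

import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({"x": [1, 2, 3, 4], "col": ["a", "a", "b", "b"]})

fig, (ax_a, ax_b) = plt.subplots(1, 2)
axes_dict = {"a": ax_a, "b": ax_b}      # stands in for FacetGrid.axes_dict

facet_keys = df["col"]                  # single facet dim; tuples when faceting both
axes_map = facet_keys.map(axes_dict)    # Series of Axes, indexed like df

# Each subset is drawn on the Axes it was mapped to
for ax, ax_df in df.groupby(axes_map, sort=False):
    ax.plot(ax_df["x"], marker="o")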
@@ -244,6 +264,9 @@ def _setup_figure(self):
if name is not None:
obj.set(**{f"{axis}label": name})

# TODO in current _attach, we initialize the units at this point
# TODO we will also need to incorporate the scaling that (could) be set

def _setup_mappings(self) -> dict[str, SemanticMapping]: # TODO literal key

all_data = pd.concat([layer.data.frame for layer in self._layers])
@@ -283,56 +306,124 @@ def _plot_layer(self, layer, mappings):
data = layer.data
stat = layer.stat

# TODO where does this method come from?
# data = self.as_numeric(layer.data)
df = self._scale_coords(data.frame)

# TODO how do we handle orientation?
# TODO how can we special-case fast aggregations? (i.e. mean, std, etc.)
# TODO should we pass the grouping variables to the Stat and let it handle that?
stat_grouping_vars = [var for var in grouping_vars if var in data]
if stat.orient not in stat_grouping_vars:
stat_grouping_vars.append(stat.orient)
if stat is not None: # TODO or default to Identity, but we'll have groupby cost
data.frame = (
data.frame
stat_grouping_vars = [var for var in grouping_vars if var in data]
if stat.orient not in stat_grouping_vars:
stat_grouping_vars.append(stat.orient)
df = (
df
.groupby(stat_grouping_vars)
.apply(stat)
# TODO unclear why next step is needed, x/y end up in frame AND index
# .drop(stat_grouping_vars, axis=1, errors="ignore")
# TODO next because of https://github.com/pandas-dev/pandas/issues/34809
.drop(stat_grouping_vars, axis=1, errors="ignore")
.reset_index(stat_grouping_vars)
.reset_index(drop=True) # TODO not always needed, can we limit?
)

# Our statistics happen on the scale we want, but then matplotlib is going
# to re-handle the scaling, so we need to invert before handing off
# Note: we don't need to convert back to strings for categories (but we could?)
# data = self.invert_scale(data)
df = self._unscale_coords(df)

# TODO this might make debugging annoying ... should we create new layer object?
layer.data = data

# Something like this?
# TODO pass in split generator that will be the source of ax
ax = self._ax
facets = self._facets
layer.data.frame = df

# TODO the layer.data somehow needs to pick up variables added in Plot.facet()
splitgen = self._make_splitgen(grouping_vars, layer, mappings, ax, facets)
splitgen = self._make_splitgen(grouping_vars, layer, mappings)

layer.mark._plot(splitgen, mappings)
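(A toy illustration, mine rather than code from this commit, of the comment above about statistics happening on the scaled coordinates: on a log axis the aggregate should be computed in log space and then inverted, otherwise the drawn value is the plain arithmetic mean instead of the one consistent with the axis scale.)

import numpy as np

y = np.array([1.0, 10.0, 100.0])

naive = y.mean()                  # 37.0: arithmetic mean in data space
scaled_mean = np.log10(y).mean()  # 1.0: the stat computed in scaled (log) space
drawn = 10 ** scaled_mean         # 10.0: unscaled again before handing to matplotlib,
                                  # i.e. the geometric mean a log-axis reader expects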

def _scale_coords(self, df):

# TODO any reason to scale the semantics here?
out_df = df.drop(["x", "y"], axis=1, errors="ignore").copy(deep=False)
coord_df = df.filter(regex="[xy]")

with pd.option_context("mode.use_inf_as_null", True):
coord_df = coord_df.dropna()

if self._ax is not None:
self._scale_coords_single(coord_df, out_df, self._ax)
else:
grouped = coord_df.groupby(self._axes_map, sort=False)
for ax, ax_df in grouped:
self._scale_coords_single(ax_df, out_df, ax)

# TODO do we need to handle nas again, e.g. if negative values
# went into a log transform?
# cf GH2454

return out_df

def _scale_coords_single(self, coord_df, out_df, ax):

# TODO modify out_df in place or return and handle externally?

# TODO this looped through "yx" in original core ... why?
# for var in "yx":
# if var not in coord_df:
# continue
for var, col in coord_df.items():

axis = getattr(ax, f"{var}axis")

# TODO should happen upstream, in setup_figure(?), but here for now
# will need to account for order; we don't have that yet
axis.update_units(col)

# TODO subset categories based on whether specified in order
...

scaled = axis.get_transform().transform(axis.convert_units(col))
out_df.loc[col.index, var] = scaled
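(As a reference for the calls used in _scale_coords_single, a minimal matplotlib sketch, not part of the diff, assuming a log-scaled y axis: update_units registers the data with the axis unit machinery, convert_units turns unit-typed data such as dates or categories into floats, and the scale's transform maps those floats into the scaled space where the stats run.)

import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.set_yscale("log")

y = np.array([1.0, 10.0, 100.0])
axis = ax.yaxis
axis.update_units(y)                                 # no-op for plain floats
converted = axis.convert_units(y)                    # still [1, 10, 100] here
scaled = axis.get_transform().transform(converted)   # log10 -> [0., 1., 2.]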

def _unscale_coords(self, df):

out_df = df.drop(["x", "y"], axis=1, errors="ignore").copy(deep=False)
coord_df = df.filter(regex="[xy]")

if self._ax is not None:
self._unscale_coords_single(coord_df, out_df, self._ax)
else:
# TODO the only reason this structure exists in the forward scale func
# is to support unshared categorical axes. I don't think there is any
# situation where numeric axes would have different *transforms*.
# So we should be able to do this in one step in all cases, once
# we are storing information about the scaling centrally.
grouped = coord_df.groupby(self._axes_map, sort=False)
for ax, ax_df in grouped:
self._unscale_coords_single(ax_df, out_df, ax)

return out_df

def _unscale_coords_single(self, coord_df, out_df, ax):

for var, col in coord_df.items():

axis = getattr(ax, f"{var}axis")
inverse_transform = axis.get_transform().inverted()
unscaled = inverse_transform.transform(col)
out_df.loc[col.index, var] = unscaled
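(The companion sketch for _unscale_coords_single, under the same assumptions: inverting the axis transform maps values computed in scaled space back into data units, and matplotlib re-applies the scale itself at draw time.)

import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.set_yscale("log")

scaled = np.array([0.0, 1.0, 2.0])                   # e.g. a stat computed in log space
inverse = ax.yaxis.get_transform().inverted()
unscaled = inverse.transform(scaled)                 # back to [1., 10., 100.]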

def _make_splitgen(
self,
grouping_vars,
layer,
mappings,
ax,
facets,
): # TODO typing

allow_empty = False # TODO

data = layer.data.frame
# TODO join with axes_map to simplify logic below?

ax = self._ax
facets = self._facets

grouping_vars = [var for var in grouping_vars if var in data]
if grouping_vars:
@@ -375,6 +466,7 @@ def splitgen() -> Generator[dict[str, Any], DataFrame, Axes]:

sub_vars = dict(zip(grouping_vars, key))

# TODO can we use axes_map here?
row = sub_vars.get("row", None)
col = sub_vars.get("col", None)
if row is not None and col is not None:
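(The hunk above is truncated, so purely as orientation: a rough, self-contained sketch of the split-generator idea with hypothetical names, not the commit's code, where each yielded chunk carries its subset variables, its data, and the Axes it targets.)

import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({"x": [1, 2, 3, 4], "col": ["a", "a", "b", "b"]})
fig, (ax_a, ax_b) = plt.subplots(1, 2)
axes_dict = {"a": ax_a, "b": ax_b}

def splitgen():
    # Yield (subset variables, subset data, target Axes) for each group
    for key, part in df.groupby("col", sort=False):
        yield {"col": key}, part, axes_dict[key]

for sub_vars, part, ax in splitgen():
    ax.plot(part["x"], marker="o")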
