Skip to content

Commit

Permalink
First steps towards implementing DataSource, with issues
Browse files Browse the repository at this point in the history
  • Loading branch information
mwaskom committed May 19, 2021
1 parent ecd5773 commit 4ca5964
Showing 1 changed file with 208 additions and 12 deletions.
220 changes: 208 additions & 12 deletions seaborn/_new_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pandas import DataFrame, Series


# TODO ndarray can be the numpy ArrayLike on 1.20+ (?)
Vector = Union[Series, ndarray, Sequence, Number]

# TODO Should we define a DataFrame-like type that is Union[DataFrame, Mapping]?
Expand Down Expand Up @@ -47,15 +48,16 @@ def add(

return self

def plot(self) -> Plot:
# TODO problem with "draw" meaning something specific in mpl?
def draw(self) -> Plot:

# TODO or something like this
for layer in self.layers:
self._plot_layer(layer)
self._draw_layer(layer)

return self

def _plot_layer(self, layer):
def _draw_layer(self, layer):

# Roughly ...

Expand All @@ -71,15 +73,15 @@ def _plot_layer(self, layer):
data = self.invert_scale(data)

# Something like this?
layer.mark.plot(data) # do we pass ax (and/or facets?! here?)
layer.mark._draw(data) # do we pass ax (and/or facets?! here?)

def show(self) -> Plot:

# TODO guard this here?
# We could have the option to be totally pyplot free
# in which case this method would raise
import matplotlib.pyplot as plt
self.plot()
self.draw()
plt.show()

return self
Expand Down Expand Up @@ -122,13 +124,166 @@ class DataSource:

# Who owns the existing VectorPlotter.variables, VectorPlotter.var_levels, etc.?

def __init__(self, data, variables):
def __init__(
self,
data: Union[DataFrame, Mapping, None],
variables: dict[str, Union[str, Vector]],
# TODO pass in wide semantics?
):

# TODO only specing out with long-form data for now...
data, names = self._assign_variables_longform(data, variables)

self._data = data
self._names = names

def join(self, other: DataSource) -> None: # TODO return self?

pass
# TODO Define this as a left join I guess

def join(self, other: DataSource):
# TODO If the idea is that the layer-specific data source will be on the
# left, then we can essentially do an update with other's plot_data and
# var names. But that is tricky because each layer will rarely (probably)
# define x/y, meaning that naively we would think we have wide-form data

pass
# TODO also we need to account for when we just get names at the layer
# and they reference fields in the original data. Maybe the signature
# here should chance to data, variables? (But that would reverse the
# assumed order of what is self and what is other)

# TODO also, we want to be able to disable global semantics by passing None
# here, which probably needs special handling as currently we'll end up
# with a column of Nones which is otherwise gonna get us in trouble...

data = (
self._data
.drop(other._data.columns, axis=1)
.join(other._data, how="left", copy=False) # TODO check copy
)

names = self._names.copy()
names.update(other._names)

self._data = data
self._names = names

def _assign_variables_longform(
self,
data: Union[DataFrame, Mapping, None],
variables: dict[str, Union[str, Vector]]
) -> tuple[DataFrame, dict[str, Union[str, None]]]:
"""Define plot variables given long-form data and/or vector inputs.
Parameters
----------
data
Input data where variable names map to vector values.
variables
Keys are seaborn variables (x, y, hue, ...) and values are vectors
in any format that can construct a :class:`pandas.DataFrame` or
names of columns or index levels in ``data``.
Returns
-------
plot_data : :class:`pandas.DataFrame`
Long-form data object mapping seaborn variables (x, y, hue, ...)
to data vectors.
variables : dict
Keys are defined seaborn variables; values are names inferred from
the inputs (or None when no name can be determined).
Raises
------
ValueError
When variables are strings that don't appear in ``data``.
"""
plot_data = {}
var_names = {}

# Data is optional; all variables can be defined as vectors
if data is None:
data = {}

# TODO should we try a data.to_dict() or similar here to more
# generally accept objects with that interface?
# Note that dict(df) also works for pandas, and gives us what we
# want, whereas DataFrame.to_dict() gives a nested dict instead of
# a dict of series.

# Variables can also be extraced from the index attribute
# TODO is this the most general way to enable it?
# There is no index.to_dict on multiindex, unfortunately
try:
index = data.index.to_frame()
except AttributeError:
index = {}

# The caller will determine the order of variables in plot_data
for key, val in variables.items():

# First try to treat the argument as a key for the data collection.
# But be flexible about what can be used as a key.
# Usually it will be a string, but allow numbers or tuples too when
# taking from the main data object. Only allow strings to reference
# fields in the index, because otherwise there is too much ambiguity.
try:
val_as_data_key = (
val in data
or (isinstance(val, (str, bytes)) and val in index)
)
except (KeyError, TypeError):
val_as_data_key = False

if val_as_data_key:

# We know that __getitem__ will work

if val in data:
plot_data[key] = data[val]
elif val in index:
plot_data[key] = index[val]
var_names[key] = val

elif isinstance(val, (str, bytes)):

# This looks like a column name but we don't know what it means!

err = f"Could not interpret value `{val}` for parameter `{key}`"
raise ValueError(err)

else:

# Otherwise, assume the value is itself data

# Raise when data object is present and a vector can't matched
if isinstance(data, pd.DataFrame) and not isinstance(val, pd.Series):
if np.ndim(val) and len(data) != len(val):
val_cls = val.__class__.__name__
err = (
f"Length of {val_cls} vectors must match length of `data`"
f" when both are used, but `data` has length {len(data)}"
f" and the vector passed to `{key}` has length {len(val)}."
)
raise ValueError(err)

plot_data[key] = val

# Try to infer the name of the variable
var_names[key] = getattr(val, "name", None)

# Construct a tidy plot DataFrame. This will convert a number of
# types automatically, aligning on index in case of pandas objects
plot_data = pd.DataFrame(plot_data)

# Reduce the variables dictionary to fields with valid data
var_names = {
var: name
for var, name in var_names.items()
if plot_data[var].notnull().any()
}

return plot_data, var_names


class Stat:
Expand All @@ -148,11 +303,52 @@ class Mark:

class Point(Mark):

def _plot(self, data_gen): # TODO data_gen is maybe too restrictive a name?
def __init__(self, **kwargs):

self.kwargs = kwargs

def _draw(self, plot): # TODO data_gen is maybe too restrictive a name?

kws = self.kwargs.copy()

for keys, data, ax in plot.data_gen(self.group_vars):

# Define the vectors of x and y positions
empty = np.full(len(data), np.nan)
x = data.get("x", empty)
y = data.get("y", empty)

# Set defaults for other visual attributes
kws.setdefault("edgecolor", "w")

if "style" in data:
# Use a representative marker so scatter sets the edgecolor
# properly for line art markers. We currently enforce either
# all or none line art so this works.
example_level = self._style_map.levels[0]
example_marker = self._style_map(example_level, "marker")
kws.setdefault("marker", example_marker)

# TODO this makes it impossible to vary alpha with hue which might
# otherwise be useful? Should we just pass None?
kws["alpha"] = 1 if self.alpha == "auto" else self.alpha

# Draw the scatter plot
points = ax.scatter(x=x, y=y, **kws)

# Apply the mapping from semantic variables to artist attributes

if "hue" in self.variables:
points.set_facecolors(self._hue_map(data["hue"]))

if "size" in self.variables:
points.set_sizes(self._size_map(data["size"]))

for keys, data, ax in data_gen(self.group_vars):
if "style" in self.variables:
p = [self._style_map(val, "path") for val in data["style"]]
points.set_paths(p)

pass
# Apply dependant default attributes


class Layer:
Expand Down

0 comments on commit 4ca5964

Please sign in to comment.