Skip to content

Commit

Permalink
Working DataSet.update logic
Browse files Browse the repository at this point in the history
  • Loading branch information
mwaskom committed May 20, 2021
1 parent 4ca5964 commit 8b1202b
Showing 1 changed file with 70 additions and 39 deletions.
109 changes: 70 additions & 39 deletions seaborn/_new_core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Union, Optional
from collections.abc import Hashable, Sequence, Mapping
from collections.abc import Hashable, Sequence, Mapping, Sized
from numbers import Number

import numpy as np
Expand All @@ -17,6 +17,9 @@

class Plot:

data: DataSet # TODO possibly should be private?
layers: list[Layer] # TODO probably should be private?

def __init__(
self,
data: Optional[Union[DataFrame, Mapping]] = None,
Expand All @@ -26,8 +29,8 @@ def __init__(
# Note that we can't assume wide-form here if variables does not contain x or y
# because those might get assigned in long-form fashion per layer.

self.data = DataSource(data, variables)
self.layers: list[Layer] = []
self.data = DataSet(data, variables)
self.layers = []

def add(
self,
Expand All @@ -39,7 +42,7 @@ def add(

# TODO what if in wide-form mode, we convert to long-form
# based on the transform that mark defines?
data = DataSource(data, variables).join(self.data)
data = self.data.update(data, variables)

if stat is None:
stat = mark.default_stat
Expand All @@ -51,6 +54,10 @@ def add(
# TODO problem with "draw" meaning something specific in mpl?
def draw(self) -> Plot:

# TODO one option is to loop over the layers here and use them to
# initialize any scaling/mapping we need to do (using parameters)
# possibly previously set and stored through calls to map_hue etc.

# TODO or something like this
for layer in self.layers:
self._draw_layer(layer)
Expand Down Expand Up @@ -107,7 +114,7 @@ def _repr_html_(self) -> str:
# stat?)


class DataSource:
class DataSet: # TODO better name?

# How to handle wide-form data here, when the dimensional semantics are defined by
# the mark? (I guess? that will be most consistent with how it currently works.)
Expand All @@ -124,54 +131,79 @@ class DataSource:

# Who owns the existing VectorPlotter.variables, VectorPlotter.var_levels, etc.?

frame: DataFrame
names: dict[str, Optional[str]]
_source: Optional[Union[DataFrame, Mapping]]

def __init__(
self,
data: Union[DataFrame, Mapping, None],
variables: dict[str, Union[str, Vector]],
data: Optional[Union[DataFrame, Mapping]],
variables: Optional[dict[str, Union[str, Vector]]],
# TODO pass in wide semantics?
):

# TODO only specing out with long-form data for now...
data, names = self._assign_variables_longform(data, variables)
if variables is None:
variables = {}

self._data = data
self._names = names
# TODO only specing out with long-form data for now...
frame, names = self._assign_variables_longform(data, variables)

def join(self, other: DataSource) -> None: # TODO return self?
self.orig = data
self.frame = frame
self.names = names

# TODO Define this as a left join I guess
def update(
self,
data: Optional[Union[DataFrame, Mapping]],
variables: Optional[dict[str, Union[str, Vector]]],
) -> DataSet:

# TODO If the idea is that the layer-specific data source will be on the
# left, then we can essentially do an update with other's plot_data and
# var names. But that is tricky because each layer will rarely (probably)
# define x/y, meaning that naively we would think we have wide-form data

# TODO also we need to account for when we just get names at the layer
# and they reference fields in the original data. Maybe the signature
# here should change to data, variables? (But that would reverse the
# assumed order of what is self and what is other)

# TODO also, we want to be able to disable global semantics by passing None
# here, which probably needs special handling as currently we'll end up
# with a column of Nones which is otherwise gonna get us in trouble...

data = (
self._data
.drop(other._data.columns, axis=1)
.join(other._data, how="left", copy=False) # TODO check copy
# Inherit the original source of the data
if data is None:
data = self._source

# Passing var=None implies that we do not want that variable in this layer
drop_cols = []
new_variables = {}
for key, val in variables.items():
if val is None:
drop_cols.append(key)
else:
new_variables[key] = val

# Create a new dataset with just the info passed here
new = DataSet(data, new_variables)

# Update the inherited DataFrame and names with this new information
names = {k: v for k, v in self.names.items() if k not in drop_cols}
names.update(new.names)

drop_cols += new.frame.columns.to_list()
frame = (
self.frame
.drop(drop_cols, axis=1, errors="ignore")
.join(new.frame, how="left")
)

names = self._names.copy()
names.update(other._names)
new.frame = frame
new.names = names

self._data = data
self._names = names
return new

def _assign_variables_longform(
self,
data: Union[DataFrame, Mapping, None],
data: Optional[Union[DataFrame, Mapping]],
variables: dict[str, Union[str, Vector]]
) -> tuple[DataFrame, dict[str, Union[str, None]]]:
) -> tuple[DataFrame, dict[str, Optional[str]]]:
"""Define plot variables given long-form data and/or vector inputs.
Parameters
Expand All @@ -185,10 +217,10 @@ def _assign_variables_longform(
Returns
-------
plot_data : :class:`pandas.DataFrame`
frame
Long-form data object mapping seaborn variables (x, y, hue, ...)
to data vectors.
variables : dict
names
Keys are defined seaborn variables; values are names inferred from
the inputs (or None when no name can be determined).
Expand All @@ -211,15 +243,14 @@ def _assign_variables_longform(
# want, whereas DataFrame.to_dict() gives a nested dict instead of
# a dict of series.

# Variables can also be extraced from the index attribute
# Variables can also be extracted from the index attribute
# TODO is this the most general way to enable it?
# There is no index.to_dict on multiindex, unfortunately
try:
index = data.index.to_frame()
except AttributeError:
index = {}

# The caller will determine the order of variables in plot_data
for key, val in variables.items():

# First try to treat the argument as a key for the data collection.
Expand All @@ -243,9 +274,9 @@ def _assign_variables_longform(
plot_data[key] = data[val]
elif val in index:
plot_data[key] = index[val]
var_names[key] = val
var_names[key] = str(val)

elif isinstance(val, (str, bytes)):
elif isinstance(val, str):

# This looks like a column name but we don't know what it means!

Expand All @@ -258,7 +289,7 @@ def _assign_variables_longform(

# Raise when data object is present and a vector can't be matched
if isinstance(data, pd.DataFrame) and not isinstance(val, pd.Series):
if np.ndim(val) and len(data) != len(val):
if isinstance(val, Sized) and len(data) != len(val):
val_cls = val.__class__.__name__
err = (
f"Length of {val_cls} vectors must match length of `data`"
Expand All @@ -274,16 +305,16 @@ def _assign_variables_longform(

# Construct a tidy plot DataFrame. This will convert a number of
# types automatically, aligning on index in case of pandas objects
plot_data = pd.DataFrame(plot_data)
frame = pd.DataFrame(plot_data)

# Reduce the variables dictionary to fields with valid data
var_names = {
names = {
var: name
for var, name in var_names.items()
if plot_data[var].notnull().any()
}

return plot_data, var_names
return frame, names


class Stat:
Expand Down Expand Up @@ -356,7 +387,7 @@ class Layer:
# Does this need to be anything other than a simple container for these attributes?
# Could use a Dataclass I guess?

def __init__(self, data: DataSource, mark: Mark, stat: Stat = None):
def __init__(self, data: DataSet, mark: Mark, stat: Stat = None):

self.data = data
self.mark = mark
Expand Down

0 comments on commit 8b1202b

Please sign in to comment.