Skip to content

Commit

Permalink
Some more name and type shedding
Browse files Browse the repository at this point in the history
  • Loading branch information
mwaskom committed May 20, 2021
1 parent 8b1202b commit 718a191
Showing 1 changed file with 56 additions and 56 deletions.
112 changes: 56 additions & 56 deletions seaborn/_new_core.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,54 @@
from __future__ import annotations
from typing import Union, Optional
from typing import Union, Optional, List, Dict, Tuple
from collections.abc import Hashable, Sequence, Mapping, Sized
from numbers import Number

import numpy as np
from numpy import ndarray
import pandas as pd
from pandas import DataFrame, Series
from pandas import DataFrame, Series, Index


# TODO ndarray can be the numpy ArrayLike on 1.20+ (?)
Vector = Union[Series, ndarray, Sequence, Number]
# TODO pandas typing (from data-science-types?) doesn't like Number
Vector = Union[Series, Index, ndarray, Sequence, Number]

# TODO Should we define a DataFrame-like type that is Union[DataFrame, Mapping]?


class Plot:

data: DataSet # TODO possibly should be private?
layers: list[Layer] # TODO probably should be private?
data: PlotData # TODO possibly should be private?
layers: List[Layer] # TODO probably should be private?

def __init__(
self,
data: Optional[Union[DataFrame, Mapping]] = None,
**variables: Union[Hashable, Vector],
**variables: Optional[Union[Hashable, Vector]],
):

# Note that we can't assume wide-form here if variables does not contain x or y
# because those might get assigned in long-form fashion per layer.

self.data = DataSet(data, variables)
self.data = PlotData(data, variables)
self.layers = []

def add(
self,
mark: Mark,
stat: Stat = None,
data: Union[DataFrame, Mapping, None] = None,
variables: dict[str, Union[Hashable, Vector, None]] = None,
data: Optional[Union[DataFrame, Mapping]] = None,
variables: Optional[Dict[str, Optional[Union[Hashable, Vector]]]] = None,
) -> Plot:

# TODO what if in wide-form mode, we convert to long-form
# based on the transform that mark defines?
data = self.data.update(data, variables)
layer_data = self.data.update(data, variables)

if stat is None:
stat = mark.default_stat

self.layers.append(Layer(data, mark, stat))
self.layers.append(Layer(layer_data, mark, stat))

return self

Expand Down Expand Up @@ -114,7 +115,7 @@ def _repr_html_(self) -> str:
# stat?)


class DataSet: # TODO better name?
class PlotData: # TODO better name?

# How to handle wide-form data here, when the dimensional semantics are defined by
# the mark? (I guess? that will be most consistent with how it currently works.)
Expand All @@ -132,13 +133,13 @@ class DataSet: # TODO better name?
# Who owns the existing VectorPlotter.variables, VectorPlotter.var_levels, etc.?

frame: DataFrame
names: dict[str, Optional[str]]
names: Dict[str, Optional[str]]
_source: Optional[Union[DataFrame, Mapping]]

def __init__(
self,
data: Optional[Union[DataFrame, Mapping]],
variables: Optional[dict[str, Union[str, Vector]]],
variables: Optional[Dict[str, Union[Hashable, Vector]]],
# TODO pass in wide semantics?
):

Expand All @@ -148,52 +149,46 @@ def __init__(
# TODO only specing out with long-form data for now...
frame, names = self._assign_variables_longform(data, variables)

self.orig = data
self.frame = frame
self.names = names
self._source = data

def update(
self,
data: Optional[Union[DataFrame, Mapping]],
variables: Optional[dict[str, Union[str, Vector]]],
) -> DataSet:
variables: Optional[Dict[str, Union[Hashable, Vector]]],
) -> PlotData:

# TODO If the idea is that the layer-specific data source will be on the
# left, then we can essentially do an update with other's plot_data and
# var names. But that is tricky because each layer will rarely (probably)
# define x/y, meaning that naively we would think we have wide-form data
# TODO name-wise, does update imply an inplace operation in a confusing way?

# TODO also, we want to be able to disable global semantics by passing None
# here, which probably needs special handling as currently we'll end up
# with a column of Nones which is otherwise gonna get us in trouble...
# TODO Note a tricky thing here which is that often x/y will be inherited
# meaning that the variable specification here will look like "wide-form"

# Inherit the original source of the data
# Inherit the original source of the upsteam data by default
if data is None:
data = self._source

if variables is None:
variables = {}

# Passing var=None implies that we do not want that variable in this layer
drop_cols = []
new_variables = {}
for key, val in variables.items():
if val is None:
drop_cols.append(key)
else:
new_variables[key] = val
disinherit = [k for k, v in variables.items() if v is None]

# Create a new dataset with just the info passed here
new = DataSet(data, new_variables)
new = PlotData(data, variables)

# Update the inherited DataFrame and names with this new information
names = {k: v for k, v in self.names.items() if k not in drop_cols}
names.update(new.names)
# -- Update the inherited DataFrame and names with this new information

drop_cols += new.frame.columns.to_list()
drop_cols = [k for k in self.frame if k in new.frame or k in disinherit]
frame = (
self.frame
.drop(drop_cols, axis=1, errors="ignore")
.join(new.frame, how="left")
.drop(columns=drop_cols)
.join(new.frame) # type: ignore # thinks frame.join is a Series??
)

names = {k: v for k, v in self.names.items() if k not in disinherit}
names.update(new.names)

new.frame = frame
new.names = names

Expand All @@ -202,8 +197,8 @@ def update(
def _assign_variables_longform(
self,
data: Optional[Union[DataFrame, Mapping]],
variables: dict[str, Union[str, Vector]]
) -> tuple[DataFrame, dict[str, Optional[str]]]:
variables: Dict[str, Optional[Union[Hashable, Vector]]]
) -> Tuple[DataFrame, Dict[str, Optional[str]]]:
"""Define plot variables given long-form data and/or vector inputs.
Parameters
Expand All @@ -230,8 +225,8 @@ def _assign_variables_longform(
When variables are strings that don't appear in ``data``.
"""
plot_data = {}
var_names = {}
plot_data: Dict[str, Vector] = {}
var_names: Dict[str, Optional[str]] = {}

# Data is optional; all variables can be defined as vectors
if data is None:
Expand All @@ -246,22 +241,26 @@ def _assign_variables_longform(
# Variables can also be extracted from the index attribute
# TODO is this the most general way to enable it?
# There is no index.to_dict on multiindex, unfortunately
try:
index = data.index.to_frame()
except AttributeError:
if hasattr(data, "index"):
index = data.index.to_frame() # type: ignore # mypy/1424
else:
index = {}

for key, val in variables.items():

# First try to treat the argument as a key for the data collection.
# Simply ignore variables with no specification
if val is None:
continue

# Try to treat the argument as a key for the data collection.
# But be flexible about what can be used as a key.
# Usually it will be a string, but allow numbers or tuples too when
# taking from the main data object. Only allow strings to reference
# fields in the index, because otherwise there is too much ambiguity.
try:
val_as_data_key = (
val in data
or (isinstance(val, (str, bytes)) and val in index)
or (isinstance(val, str) and val in index)
)
except (KeyError, TypeError):
val_as_data_key = False
Expand All @@ -271,9 +270,9 @@ def _assign_variables_longform(
# We know that __getitem__ will work

if val in data:
plot_data[key] = data[val]
plot_data[key] = data[val] # type: ignore # fails on key: Hashable
elif val in index:
plot_data[key] = index[val]
plot_data[key] = index[val] # type: ignore # fails on key: Hashable
var_names[key] = str(val)

elif isinstance(val, str):
Expand All @@ -298,20 +297,21 @@ def _assign_variables_longform(
)
raise ValueError(err)

plot_data[key] = val
plot_data[key] = val # type: ignore # fails on key: Hashable

# Try to infer the name of the variable
var_names[key] = getattr(val, "name", None)

# Construct a tidy plot DataFrame. This will convert a number of
# types automatically, aligning on index in case of pandas objects
frame = pd.DataFrame(plot_data)
frame = pd.DataFrame(plot_data) # type: ignore # should allow dict[str, Number]

# Reduce the variables dictionary to fields with valid data
names = {
names: Dict[str, Optional[str]] = {
var: name
for var, name in var_names.items()
if plot_data[var].notnull().any()
# TODO I am not sure that this is necessary any more
if frame[var].notnull().any()
}

return frame, names
Expand All @@ -325,7 +325,7 @@ class Stat:
class Mark:

# TODO will subclasses overwrite this? Should this be defined elsewhere?
group_vars: list[str] = ["col", "row", "group"]
group_vars: List[str] = ["col", "row", "group"]

default_stat: Optional[Stat] = None # TODO or identity?

Expand Down Expand Up @@ -387,7 +387,7 @@ class Layer:
# Does this need to be anything other than a simple container for these attributes?
# Could use a Dataclass I guess?

def __init__(self, data: DataSet, mark: Mark, stat: Stat = None):
def __init__(self, data: PlotData, mark: Mark, stat: Stat = None):

self.data = data
self.mark = mark
Expand Down

0 comments on commit 718a191

Please sign in to comment.