diff --git a/seaborn/_core/data.py b/seaborn/_core/data.py index 6c1cf87217..6fe0a0a5b5 100644 --- a/seaborn/_core/data.py +++ b/seaborn/_core/data.py @@ -132,10 +132,11 @@ def _assign_variables( names: dict[str, str | None] plot_data = {} - var_names = {} + names = {} # Data is optional; all variables can be defined as vectors - if data is None: + data_not_passed = data is None + if data_not_passed: data = {} # TODO Generally interested in accepting a generic DataFrame interface @@ -172,23 +173,28 @@ def _assign_variables( plot_data[key] = data[val] elif val in index: plot_data[key] = index[val] - var_names[key] = str(val) + names[key] = str(val) elif isinstance(val, str): - # This looks like a column name but we don't know what it means! - # TODO improve this feedback to distinguish between - # - "you passed a string, but did not pass data" - # - "you passed a string, it was not found in data" + # This looks like a column name but, lookup failed. - err = f"Could not interpret value `{val}` for parameter `{key}`" + err = f"Could not interpret value `{val}` for `{key}`. " + if data_not_passed: + err += "Value is a string, but `data` was not passed." + else: + err += "An entry with this name does not appear in `data`." raise ValueError(err) else: - # Otherwise, assume the value is itself data + # Otherwise, assume the value somehow represents data + + # Ignore empty data structures + if isinstance(val, abc.Sized) and len(val) == 0: + continue - # Raise when data object is present and a vector can't matched + # If vector has no index, it must match length of data table if isinstance(data, pd.DataFrame) and not isinstance(val, pd.Series): if isinstance(val, abc.Sized) and len(data) != len(val): val_cls = val.__class__.__name__ @@ -201,19 +207,14 @@ def _assign_variables( plot_data[key] = val - # Try to infer the name of the variable - var_names[key] = getattr(val, "name", None) + # Try to infer the original name using pandas-like metadata + if hasattr(val, "name"): + names[key] = str(val.name) + else: + names[key] = None # Construct a tidy plot DataFrame. This will convert a number of # types automatically, aligning on index in case of pandas objects frame = pd.DataFrame(plot_data) - # Build the mapping from plot variables to original names (if available) - names = { - var: name - for var, name in var_names.items() - # TODO I am not sure that this is necessary any more - if frame[var].notnull().any() - } - return frame, names diff --git a/seaborn/tests/_core/test_data.py b/seaborn/tests/_core/test_data.py index 7ba4c0de54..4c89a16924 100644 --- a/seaborn/tests/_core/test_data.py +++ b/seaborn/tests/_core/test_data.py @@ -190,7 +190,7 @@ def test_key_not_in_data_raises(self, long_df): var = "x" key = "what" - msg = f"Could not interpret value `{key}` for parameter `{var}`" + msg = f"Could not interpret value `{key}` for `{var}`. An entry with this name" with pytest.raises(ValueError, match=msg): PlotData(long_df, {var: key}) @@ -198,9 +198,9 @@ def test_key_with_no_data_raises(self): var = "x" key = "what" - msg = f"Could not interpret value `{key}` for parameter `{var}`" + msg = f"Could not interpret value `{key}` for `{var}`. Value is a string," with pytest.raises(ValueError, match=msg): - PlotData(variables={var: key}) + PlotData(None, {var: key}) def test_data_vector_different_lengths_raises(self, long_df):