From a5b524b59ad3467eef9922cb48b469cd63123f62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Bel=C3=A1k?= Date: Wed, 30 Aug 2023 14:11:09 +0200 Subject: [PATCH] feat: use `data_types.is_categorical` instead of `utils.is_categorical` Preparatory commit for removal of `utils.is_categorical`. Slightly changes behavior of some plots, e.g. parallel categories is now stricter in choosing which columns are considered as categorical. --- edvart/plots.py | 7 +++---- .../report_sections/multivariate_analysis.py | 20 +++++++++---------- .../time_series_line_plot.py | 5 ++--- tests/test_multivariate_analysis.py | 10 +++++----- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/edvart/plots.py b/edvart/plots.py index 30d2679..b465fff 100644 --- a/edvart/plots.py +++ b/edvart/plots.py @@ -7,8 +7,7 @@ import pandas as pd import plotly.graph_objs as go -from edvart import utils -from edvart.data_types import is_numeric +from edvart.data_types import is_categorical, is_numeric # Multiplier which makes plotly interactive plots (size in pixels) and # matplotlib plots (size in inches) about the same size @@ -101,7 +100,7 @@ def _scatter_plot_2d_noninteractive( ) -> None: _fig, ax = plt.subplots(figsize=figsize) if color_col is not None: - is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col]) + is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col]) if is_color_categorical: color_categorical = pd.Categorical(df[color_col]) color_codes = color_categorical.codes @@ -163,7 +162,7 @@ def _scatter_plot_2d_interactive( layout.yaxis.scaleratio = 1 fig = go.Figure(layout=layout) if color_col is not None: - is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col]) + is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col]) if is_color_categorical: df = df.copy() x_name, y_name = "__edvart_scatter_x", "__edvart_scatter_y" diff --git a/edvart/report_sections/multivariate_analysis.py b/edvart/report_sections/multivariate_analysis.py index a87f662..34ae547 100644 --- a/edvart/report_sections/multivariate_analysis.py +++ b/edvart/report_sections/multivariate_analysis.py @@ -12,11 +12,11 @@ from IPython.display import Markdown, display from sklearn.preprocessing import StandardScaler -from edvart.data_types import is_numeric +from edvart.data_types import is_boolean, is_categorical, is_numeric from edvart.plots import scatter_plot_2d from edvart.report_sections.code_string_formatting import get_code, total_dedent from edvart.report_sections.section_base import ReportSection, Section, Verbosity -from edvart.utils import discrete_colorscale, is_categorical +from edvart.utils import discrete_colorscale try: from edvart.report_sections.umap import UMAP @@ -533,11 +533,9 @@ def __init__( columns = [ col for col in df.columns - if is_numeric(df[col]) - or ( - is_categorical(df[col], nunique_max=nunique_max) - and df[col].nunique() <= nunique_max - ) + if is_categorical(df[col], unique_value_count_threshold=nunique_max) + or is_boolean(df[col]) + or is_numeric(df[col]) ] # If all columns are numeric we don't want to list them all in the generated call # Setting columns to None will result in the columns argument not being included @@ -740,10 +738,10 @@ def __init__( columns = [ col for col in df.columns - if ( - is_categorical(df[col], nunique_max=nunique_max) - and df[col].nunique() <= nunique_max - ) + if is_categorical( + df[col], + unique_value_count_threshold=nunique_max + ) or is_boolean(df[col]) ] # If all columns are numeric we don't want to list them all in the generated call diff --git a/edvart/report_sections/timeseries_analysis/time_series_line_plot.py b/edvart/report_sections/timeseries_analysis/time_series_line_plot.py index e5f8cc9..96ec58e 100644 --- a/edvart/report_sections/timeseries_analysis/time_series_line_plot.py +++ b/edvart/report_sections/timeseries_analysis/time_series_line_plot.py @@ -8,8 +8,7 @@ import plotly.graph_objects as go from IPython.display import Markdown, display -from edvart import utils -from edvart.data_types import is_numeric +from edvart.data_types import is_categorical, is_numeric from edvart.decorators import check_index_time_ascending from edvart.report_sections.code_string_formatting import get_code, total_dedent from edvart.report_sections.section_base import Section, Verbosity @@ -115,7 +114,7 @@ def _time_series_line_plot_colored(df, columns=None, color_col=None): ) layout = dict(xaxis_rangeslider_visible=True) - if not utils.is_categorical(df[color_col]): + if not is_categorical(df[color_col]): raise ValueError(f"Cannot color by non-categorical column `{color_col}`") if df[color_col].nunique() > 20: warnings.warn("Coloring by categorical column with many unique values!") diff --git a/tests/test_multivariate_analysis.py b/tests/test_multivariate_analysis.py index 9aa3dfc..0b190a7 100644 --- a/tests/test_multivariate_analysis.py +++ b/tests/test_multivariate_analysis.py @@ -197,7 +197,7 @@ def test_code_export_verbosity_medium_all_cols_valid(): expected_code = [ "pca_first_vs_second(df=df)", "pca_explained_variance(df=df)", - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['col2'])", ] assert len(exported_code) == len(expected_code) @@ -228,14 +228,14 @@ def test_generated_code_verobsity_1(): )""" ), "parallel_coordinates(df=df)", - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['B'])", ] else: expected_code = [ "pca_first_vs_second(df=df, columns=['A', 'C', 'D'])", "pca_explained_variance(df=df, columns=['A', 'C', 'D'])", "parallel_coordinates(df=df)", - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['B'])", ] assert len(exported_code) == len(expected_code) @@ -275,7 +275,7 @@ def test_generated_code_verobsity_2(): ( get_code(utils.discrete_colorscale), get_code(multivariate_analysis.ParallelCategories.parallel_categories), - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['B'])", ) ), ] @@ -352,7 +352,7 @@ def test_verbosity_low_different_subsection_verbosities(): expected_subsections_str = ", ".join(expected_subsections) expected_code = [ "multivariate_analysis(df=df, " f"subsections=[{expected_subsections_str}])", - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['B'])", "\n\n".join( ( get_code(utils.discrete_colorscale),