From 39df303842cb028daae4c2444be50897afe5be75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Bel=C3=A1k?= Date: Wed, 6 Sep 2023 11:03:19 +0200 Subject: [PATCH] feat!: Remove `utils.is_categorical` (#108) Resolves #25 BREAKING CHANGE: The function `edvart.utils.is_categorical` is removed. Use `edvart.data_types.is_categorical` instead; it behaves similarly, but its unique-value limit is passed as `unique_value_count_threshold` rather than `nunique_max`. BREAKING CHANGE: Column selection changes slightly for some plots, e.g. parallel categories is now stricter about which columns it considers categorical. --- edvart/plots.py | 7 +++--- .../report_sections/multivariate_analysis.py | 18 ++++++-------- .../time_series_line_plot.py | 5 ++-- edvart/utils.py | 24 ------------------- tests/test_multivariate_analysis.py | 10 ++++---- tests/test_utils.py | 1 - 6 files changed, 17 insertions(+), 48 deletions(-) diff --git a/edvart/plots.py b/edvart/plots.py index 30d2679..b465fff 100644 --- a/edvart/plots.py +++ b/edvart/plots.py @@ -7,8 +7,7 @@ import pandas as pd import plotly.graph_objs as go -from edvart import utils -from edvart.data_types import is_numeric +from edvart.data_types import is_categorical, is_numeric # Multiplier which makes plotly interactive plots (size in pixels) and # matplotlib plots (size in inches) about the same size @@ -101,7 +100,7 @@ def _scatter_plot_2d_noninteractive( ) -> None: _fig, ax = plt.subplots(figsize=figsize) if color_col is not None: - is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col]) + is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col]) if is_color_categorical: color_categorical = pd.Categorical(df[color_col]) color_codes = color_categorical.codes @@ -163,7 +162,7 @@ def _scatter_plot_2d_interactive( layout.yaxis.scaleratio = 1 fig = go.Figure(layout=layout) if color_col is not None: - is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col]) + is_color_categorical = 
is_categorical(df[color_col]) or not is_numeric(df[color_col]) if is_color_categorical: df = df.copy() x_name, y_name = "__edvart_scatter_x", "__edvart_scatter_y" diff --git a/edvart/report_sections/multivariate_analysis.py b/edvart/report_sections/multivariate_analysis.py index a87f662..e576f7b 100644 --- a/edvart/report_sections/multivariate_analysis.py +++ b/edvart/report_sections/multivariate_analysis.py @@ -12,11 +12,11 @@ from IPython.display import Markdown, display from sklearn.preprocessing import StandardScaler -from edvart.data_types import is_numeric +from edvart.data_types import is_boolean, is_categorical, is_numeric from edvart.plots import scatter_plot_2d from edvart.report_sections.code_string_formatting import get_code, total_dedent from edvart.report_sections.section_base import ReportSection, Section, Verbosity -from edvart.utils import discrete_colorscale, is_categorical +from edvart.utils import discrete_colorscale try: from edvart.report_sections.umap import UMAP @@ -533,11 +533,9 @@ def __init__( columns = [ col for col in df.columns - if is_numeric(df[col]) - or ( - is_categorical(df[col], nunique_max=nunique_max) - and df[col].nunique() <= nunique_max - ) + if is_categorical(df[col], unique_value_count_threshold=nunique_max) + or is_boolean(df[col]) + or is_numeric(df[col]) ] # If all columns are numeric we don't want to list them all in the generated call # Setting columns to None will result in the columns argument not being included @@ -740,10 +738,8 @@ def __init__( columns = [ col for col in df.columns - if ( - is_categorical(df[col], nunique_max=nunique_max) - and df[col].nunique() <= nunique_max - ) + if is_categorical(df[col], unique_value_count_threshold=nunique_max) + or is_boolean(df[col]) ] # If all columns are numeric we don't want to list them all in the generated call diff --git a/edvart/report_sections/timeseries_analysis/time_series_line_plot.py b/edvart/report_sections/timeseries_analysis/time_series_line_plot.py index 
6c38f5e..8d9eb24 100644 --- a/edvart/report_sections/timeseries_analysis/time_series_line_plot.py +++ b/edvart/report_sections/timeseries_analysis/time_series_line_plot.py @@ -8,8 +8,7 @@ import plotly.graph_objects as go from IPython.display import Markdown, display -from edvart import utils -from edvart.data_types import is_numeric +from edvart.data_types import is_categorical, is_numeric from edvart.decorators import check_index_time_ascending from edvart.report_sections.code_string_formatting import get_code, total_dedent from edvart.report_sections.section_base import Section, Verbosity @@ -115,7 +114,7 @@ def _time_series_line_plot_colored(df, columns=None, color_col=None): ) layout = dict(xaxis_rangeslider_visible=True) - if not utils.is_categorical(df[color_col]): + if not is_categorical(df[color_col]): raise ValueError(f"Cannot color by non-categorical column `{color_col}`") if df[color_col].nunique() > 20: warnings.warn("Coloring by categorical column with many unique values!") diff --git a/edvart/utils.py b/edvart/utils.py index 27f6d4f..588fa0e 100755 --- a/edvart/utils.py +++ b/edvart/utils.py @@ -8,30 +8,6 @@ import seaborn as sns import statsmodels.api as sm -from edvart.data_types import is_numeric - - -def is_categorical(series: pd.Series, nunique_max: int = 20) -> bool: - """ - A heuristic of whether a series is categorical or numerical. - - Parameters - ---------- - series: pd.Series - Input series - nunique_max: int (default = 20) - Maximum number of unique values for a numeric series to be regarded as categorical. - No limit on number of unique values if set to a negative number. 
- - Returns - ------- - bool - True if series contains categorical values, otherwise False - """ - return ( - (nunique_max < 0 or series.nunique() <= nunique_max) or not is_numeric(series) - ) and not pd.core.dtypes.common.is_datetime_or_timedelta_dtype(series) - def top_frequent_values(series: pd.Series, n_top: int = 10) -> Dict[Any, float]: """ diff --git a/tests/test_multivariate_analysis.py b/tests/test_multivariate_analysis.py index ed964ca..db24707 100644 --- a/tests/test_multivariate_analysis.py +++ b/tests/test_multivariate_analysis.py @@ -197,7 +197,7 @@ def test_code_export_verbosity_medium_all_cols_valid(): expected_code = [ "pca_first_vs_second(df=df)", "pca_explained_variance(df=df)", - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['col2'])", ] assert len(exported_code) == len(expected_code) @@ -228,14 +228,14 @@ def test_generated_code_verbosity_1(): )""" ), "parallel_coordinates(df=df)", - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['B'])", ] else: expected_code = [ "pca_first_vs_second(df=df, columns=['A', 'C', 'D'])", "pca_explained_variance(df=df, columns=['A', 'C', 'D'])", "parallel_coordinates(df=df)", - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['B'])", ] assert len(exported_code) == len(expected_code) @@ -275,7 +275,7 @@ def test_generated_code_verbosity_2(): ( get_code(utils.discrete_colorscale), get_code(multivariate_analysis.ParallelCategories.parallel_categories), - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['B'])", ) ), ] @@ -352,7 +352,7 @@ def test_verbosity_low_different_subsection_verbosities(): expected_subsections_str = ", ".join(expected_subsections) expected_code = [ "multivariate_analysis(df=df, " f"subsections=[{expected_subsections_str}])", - "parallel_categories(df=df)", + "parallel_categories(df=df, columns=['B'])", "\n\n".join( ( get_code(utils.discrete_colorscale), diff --git a/tests/test_utils.py b/tests/test_utils.py index 
0c92979..7eb4856 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -29,7 +29,6 @@ def test_full_na_series(): warnings.simplefilter(action="error", category=RuntimeWarning) result = func(series) assert math.isnan(float(result)) - assert utils.is_categorical(series) assert utils.num_unique_values(series) == 0