From 39df303842cb028daae4c2444be50897afe5be75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Bel=C3=A1k?= <michal.belak@datamole.cz>
Date: Wed, 6 Sep 2023 11:03:19 +0200
Subject: [PATCH] feat!: Remove `utils.is_categorical` (#108)

Resolves #25

BREAKING CHANGE: function `edvart.utils.is_categorical` is removed.
Use `edvart.data_types.is_categorical` instead; it behaves similarly.
BREAKING CHANGE: Column selection changes slightly for some plots, e.g.
parallel categories is now stricter about which columns it treats as
categorical.
---
 edvart/plots.py                               |  7 +++---
 .../report_sections/multivariate_analysis.py  | 18 ++++++--------
 .../time_series_line_plot.py                  |  5 ++--
 edvart/utils.py                               | 24 -------------------
 tests/test_multivariate_analysis.py           | 10 ++++----
 tests/test_utils.py                           |  1 -
 6 files changed, 17 insertions(+), 48 deletions(-)

diff --git a/edvart/plots.py b/edvart/plots.py
index 30d2679..b465fff 100644
--- a/edvart/plots.py
+++ b/edvart/plots.py
@@ -7,8 +7,7 @@
 import pandas as pd
 import plotly.graph_objs as go
 
-from edvart import utils
-from edvart.data_types import is_numeric
+from edvart.data_types import is_categorical, is_numeric
 
 # Multiplier which makes plotly interactive plots (size in pixels) and
 # matplotlib plots (size in inches) about the same size
@@ -101,7 +100,7 @@ def _scatter_plot_2d_noninteractive(
 ) -> None:
     _fig, ax = plt.subplots(figsize=figsize)
     if color_col is not None:
-        is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col])
+        is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col])
         if is_color_categorical:
             color_categorical = pd.Categorical(df[color_col])
             color_codes = color_categorical.codes
@@ -163,7 +162,7 @@ def _scatter_plot_2d_interactive(
         layout.yaxis.scaleratio = 1
     fig = go.Figure(layout=layout)
     if color_col is not None:
-        is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col])
+        is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col])
         if is_color_categorical:
             df = df.copy()
             x_name, y_name = "__edvart_scatter_x", "__edvart_scatter_y"
diff --git a/edvart/report_sections/multivariate_analysis.py b/edvart/report_sections/multivariate_analysis.py
index a87f662..e576f7b 100644
--- a/edvart/report_sections/multivariate_analysis.py
+++ b/edvart/report_sections/multivariate_analysis.py
@@ -12,11 +12,11 @@
 from IPython.display import Markdown, display
 from sklearn.preprocessing import StandardScaler
 
-from edvart.data_types import is_numeric
+from edvart.data_types import is_boolean, is_categorical, is_numeric
 from edvart.plots import scatter_plot_2d
 from edvart.report_sections.code_string_formatting import get_code, total_dedent
 from edvart.report_sections.section_base import ReportSection, Section, Verbosity
-from edvart.utils import discrete_colorscale, is_categorical
+from edvart.utils import discrete_colorscale
 
 try:
     from edvart.report_sections.umap import UMAP
@@ -533,11 +533,9 @@ def __init__(
             columns = [
                 col
                 for col in df.columns
-                if is_numeric(df[col])
-                or (
-                    is_categorical(df[col], nunique_max=nunique_max)
-                    and df[col].nunique() <= nunique_max
-                )
+                if is_categorical(df[col], unique_value_count_threshold=nunique_max)
+                or is_boolean(df[col])
+                or is_numeric(df[col])
             ]
             # If all columns are numeric we don't want to list them all in the generated call
             # Setting columns to None will result in the columns argument not being included
@@ -740,10 +738,8 @@ def __init__(
             columns = [
                 col
                 for col in df.columns
-                if (
-                    is_categorical(df[col], nunique_max=nunique_max)
-                    and df[col].nunique() <= nunique_max
-                )
+                if is_categorical(df[col], unique_value_count_threshold=nunique_max)
+                or is_boolean(df[col])
             ]
 
             # If all columns are numeric we don't want to list them all in the generated call
diff --git a/edvart/report_sections/timeseries_analysis/time_series_line_plot.py b/edvart/report_sections/timeseries_analysis/time_series_line_plot.py
index 6c38f5e..8d9eb24 100644
--- a/edvart/report_sections/timeseries_analysis/time_series_line_plot.py
+++ b/edvart/report_sections/timeseries_analysis/time_series_line_plot.py
@@ -8,8 +8,7 @@
 import plotly.graph_objects as go
 from IPython.display import Markdown, display
 
-from edvart import utils
-from edvart.data_types import is_numeric
+from edvart.data_types import is_categorical, is_numeric
 from edvart.decorators import check_index_time_ascending
 from edvart.report_sections.code_string_formatting import get_code, total_dedent
 from edvart.report_sections.section_base import Section, Verbosity
@@ -115,7 +114,7 @@ def _time_series_line_plot_colored(df, columns=None, color_col=None):
                     )
 
         layout = dict(xaxis_rangeslider_visible=True)
-        if not utils.is_categorical(df[color_col]):
+        if not is_categorical(df[color_col]):
             raise ValueError(f"Cannot color by non-categorical column `{color_col}`")
         if df[color_col].nunique() > 20:
             warnings.warn("Coloring by categorical column with many unique values!")
diff --git a/edvart/utils.py b/edvart/utils.py
index 27f6d4f..588fa0e 100755
--- a/edvart/utils.py
+++ b/edvart/utils.py
@@ -8,30 +8,6 @@
 import seaborn as sns
 import statsmodels.api as sm
 
-from edvart.data_types import is_numeric
-
-
-def is_categorical(series: pd.Series, nunique_max: int = 20) -> bool:
-    """
-    A heuristic of whether a series is categorical or numerical.
-
-    Parameters
-    ----------
-    series: pd.Series
-        Input series
-    nunique_max: int (default = 20)
-        Maximum number of unique values for a numeric series to be regarded as categorical.
-        No limit on number of unique values if set to a negative number.
-
-    Returns
-    -------
-    bool
-        True if series contains categorical values, otherwise False
-    """
-    return (
-        (nunique_max < 0 or series.nunique() <= nunique_max) or not is_numeric(series)
-    ) and not pd.core.dtypes.common.is_datetime_or_timedelta_dtype(series)
-
 
 def top_frequent_values(series: pd.Series, n_top: int = 10) -> Dict[Any, float]:
     """
diff --git a/tests/test_multivariate_analysis.py b/tests/test_multivariate_analysis.py
index ed964ca..db24707 100644
--- a/tests/test_multivariate_analysis.py
+++ b/tests/test_multivariate_analysis.py
@@ -197,7 +197,7 @@ def test_code_export_verbosity_medium_all_cols_valid():
     expected_code = [
         "pca_first_vs_second(df=df)",
         "pca_explained_variance(df=df)",
-        "parallel_categories(df=df)",
+        "parallel_categories(df=df, columns=['col2'])",
     ]
 
     assert len(exported_code) == len(expected_code)
@@ -228,14 +228,14 @@ def test_generated_code_verbosity_1():
                 )"""
             ),
             "parallel_coordinates(df=df)",
-            "parallel_categories(df=df)",
+            "parallel_categories(df=df, columns=['B'])",
         ]
     else:
         expected_code = [
             "pca_first_vs_second(df=df, columns=['A', 'C', 'D'])",
             "pca_explained_variance(df=df, columns=['A', 'C', 'D'])",
             "parallel_coordinates(df=df)",
-            "parallel_categories(df=df)",
+            "parallel_categories(df=df, columns=['B'])",
         ]
 
     assert len(exported_code) == len(expected_code)
@@ -275,7 +275,7 @@ def test_generated_code_verbosity_2():
             (
                 get_code(utils.discrete_colorscale),
                 get_code(multivariate_analysis.ParallelCategories.parallel_categories),
-                "parallel_categories(df=df)",
+                "parallel_categories(df=df, columns=['B'])",
             )
         ),
     ]
@@ -352,7 +352,7 @@ def test_verbosity_low_different_subsection_verbosities():
     expected_subsections_str = ", ".join(expected_subsections)
     expected_code = [
         "multivariate_analysis(df=df, " f"subsections=[{expected_subsections_str}])",
-        "parallel_categories(df=df)",
+        "parallel_categories(df=df, columns=['B'])",
         "\n\n".join(
             (
                 get_code(utils.discrete_colorscale),
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0c92979..7eb4856 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -29,7 +29,6 @@ def test_full_na_series():
             warnings.simplefilter(action="error", category=RuntimeWarning)
             result = func(series)
             assert math.isnan(float(result))
-    assert utils.is_categorical(series)
     assert utils.num_unique_values(series) == 0