From a5b524b59ad3467eef9922cb48b469cd63123f62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Bel=C3=A1k?= <michal.belak@datamole.cz>
Date: Wed, 30 Aug 2023 14:11:09 +0200
Subject: [PATCH] feat: use `data_types.is_categorical` instead of
 `utils.is_categorical`

Preparatory commit for the removal of `utils.is_categorical`. This
slightly changes the behavior of some plots, e.g. the parallel
categories plot is now stricter in choosing which columns are
considered categorical.
---
 edvart/plots.py                               |  7 +++----
 .../report_sections/multivariate_analysis.py  | 20 +++++++++----------
 .../time_series_line_plot.py                  |  5 ++---
 tests/test_multivariate_analysis.py           | 10 +++++-----
 4 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/edvart/plots.py b/edvart/plots.py
index 30d2679..b465fff 100644
--- a/edvart/plots.py
+++ b/edvart/plots.py
@@ -7,8 +7,7 @@
 import pandas as pd
 import plotly.graph_objs as go
 
-from edvart import utils
-from edvart.data_types import is_numeric
+from edvart.data_types import is_categorical, is_numeric
 
 # Multiplier which makes plotly interactive plots (size in pixels) and
 # matplotlib plots (size in inches) about the same size
@@ -101,7 +100,7 @@ def _scatter_plot_2d_noninteractive(
 ) -> None:
     _fig, ax = plt.subplots(figsize=figsize)
     if color_col is not None:
-        is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col])
+        is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col])
         if is_color_categorical:
             color_categorical = pd.Categorical(df[color_col])
             color_codes = color_categorical.codes
@@ -163,7 +162,7 @@ def _scatter_plot_2d_interactive(
         layout.yaxis.scaleratio = 1
     fig = go.Figure(layout=layout)
     if color_col is not None:
-        is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col])
+        is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col])
         if is_color_categorical:
             df = df.copy()
             x_name, y_name = "__edvart_scatter_x", "__edvart_scatter_y"
diff --git a/edvart/report_sections/multivariate_analysis.py b/edvart/report_sections/multivariate_analysis.py
index a87f662..34ae547 100644
--- a/edvart/report_sections/multivariate_analysis.py
+++ b/edvart/report_sections/multivariate_analysis.py
@@ -12,11 +12,11 @@
 from IPython.display import Markdown, display
 from sklearn.preprocessing import StandardScaler
 
-from edvart.data_types import is_numeric
+from edvart.data_types import is_boolean, is_categorical, is_numeric
 from edvart.plots import scatter_plot_2d
 from edvart.report_sections.code_string_formatting import get_code, total_dedent
 from edvart.report_sections.section_base import ReportSection, Section, Verbosity
-from edvart.utils import discrete_colorscale, is_categorical
+from edvart.utils import discrete_colorscale
 
 try:
     from edvart.report_sections.umap import UMAP
@@ -533,11 +533,9 @@ def __init__(
             columns = [
                 col
                 for col in df.columns
-                if is_numeric(df[col])
-                or (
-                    is_categorical(df[col], nunique_max=nunique_max)
-                    and df[col].nunique() <= nunique_max
-                )
+                if is_categorical(df[col], unique_value_count_threshold=nunique_max)
+                or is_boolean(df[col])
+                or is_numeric(df[col])
             ]
             # If all columns are numeric we don't want to list them all in the generated call
             # Setting columns to None will result in the columns argument not being included
@@ -740,10 +738,10 @@ def __init__(
             columns = [
                 col
                 for col in df.columns
-                if (
-                    is_categorical(df[col], nunique_max=nunique_max)
-                    and df[col].nunique() <= nunique_max
-                )
+                if is_categorical(
+                    df[col],
+                    unique_value_count_threshold=nunique_max
+                ) or is_boolean(df[col])
             ]
 
             # If all columns are numeric we don't want to list them all in the generated call
diff --git a/edvart/report_sections/timeseries_analysis/time_series_line_plot.py b/edvart/report_sections/timeseries_analysis/time_series_line_plot.py
index e5f8cc9..96ec58e 100644
--- a/edvart/report_sections/timeseries_analysis/time_series_line_plot.py
+++ b/edvart/report_sections/timeseries_analysis/time_series_line_plot.py
@@ -8,8 +8,7 @@
 import plotly.graph_objects as go
 from IPython.display import Markdown, display
 
-from edvart import utils
-from edvart.data_types import is_numeric
+from edvart.data_types import is_categorical, is_numeric
 from edvart.decorators import check_index_time_ascending
 from edvart.report_sections.code_string_formatting import get_code, total_dedent
 from edvart.report_sections.section_base import Section, Verbosity
@@ -115,7 +114,7 @@ def _time_series_line_plot_colored(df, columns=None, color_col=None):
                     )
 
         layout = dict(xaxis_rangeslider_visible=True)
-        if not utils.is_categorical(df[color_col]):
+        if not is_categorical(df[color_col]):
             raise ValueError(f"Cannot color by non-categorical column `{color_col}`")
         if df[color_col].nunique() > 20:
             warnings.warn("Coloring by categorical column with many unique values!")
diff --git a/tests/test_multivariate_analysis.py b/tests/test_multivariate_analysis.py
index 9aa3dfc..0b190a7 100644
--- a/tests/test_multivariate_analysis.py
+++ b/tests/test_multivariate_analysis.py
@@ -197,7 +197,7 @@ def test_code_export_verbosity_medium_all_cols_valid():
     expected_code = [
         "pca_first_vs_second(df=df)",
         "pca_explained_variance(df=df)",
-        "parallel_categories(df=df)",
+        "parallel_categories(df=df, columns=['col2'])",
     ]
 
     assert len(exported_code) == len(expected_code)
@@ -228,14 +228,14 @@ def test_generated_code_verobsity_1():
                 )"""
             ),
             "parallel_coordinates(df=df)",
-            "parallel_categories(df=df)",
+            "parallel_categories(df=df, columns=['B'])",
         ]
     else:
         expected_code = [
             "pca_first_vs_second(df=df, columns=['A', 'C', 'D'])",
             "pca_explained_variance(df=df, columns=['A', 'C', 'D'])",
             "parallel_coordinates(df=df)",
-            "parallel_categories(df=df)",
+            "parallel_categories(df=df, columns=['B'])",
         ]
 
     assert len(exported_code) == len(expected_code)
@@ -275,7 +275,7 @@ def test_generated_code_verobsity_2():
             (
                 get_code(utils.discrete_colorscale),
                 get_code(multivariate_analysis.ParallelCategories.parallel_categories),
-                "parallel_categories(df=df)",
+                "parallel_categories(df=df, columns=['B'])",
             )
         ),
     ]
@@ -352,7 +352,7 @@ def test_verbosity_low_different_subsection_verbosities():
     expected_subsections_str = ", ".join(expected_subsections)
     expected_code = [
         "multivariate_analysis(df=df, " f"subsections=[{expected_subsections_str}])",
-        "parallel_categories(df=df)",
+        "parallel_categories(df=df, columns=['B'])",
         "\n\n".join(
             (
                 get_code(utils.discrete_colorscale),