Skip to content

Commit

Permalink
feat!: Remove utils.is_categorical (#108)
Browse files Browse the repository at this point in the history
Resolves #25 

BREAKING CHANGE: The function `edvart.utils.is_categorical` has been removed.
Use `edvart.data_types.is_categorical` instead; its behavior is similar
but not identical.
BREAKING CHANGE: Column selection changes slightly for some plots;
e.g. parallel categories is now stricter about which columns it
treats as categorical.
  • Loading branch information
mbelak-dtml authored Sep 6, 2023
1 parent 77896b4 commit 39df303
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 48 deletions.
7 changes: 3 additions & 4 deletions edvart/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
import pandas as pd
import plotly.graph_objs as go

from edvart import utils
from edvart.data_types import is_numeric
from edvart.data_types import is_categorical, is_numeric

# Multiplier which makes plotly interactive plots (size in pixels) and
# matplotlib plots (size in inches) about the same size
Expand Down Expand Up @@ -101,7 +100,7 @@ def _scatter_plot_2d_noninteractive(
) -> None:
_fig, ax = plt.subplots(figsize=figsize)
if color_col is not None:
is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col])
is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col])
if is_color_categorical:
color_categorical = pd.Categorical(df[color_col])
color_codes = color_categorical.codes
Expand Down Expand Up @@ -163,7 +162,7 @@ def _scatter_plot_2d_interactive(
layout.yaxis.scaleratio = 1
fig = go.Figure(layout=layout)
if color_col is not None:
is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col])
is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col])
if is_color_categorical:
df = df.copy()
x_name, y_name = "__edvart_scatter_x", "__edvart_scatter_y"
Expand Down
18 changes: 7 additions & 11 deletions edvart/report_sections/multivariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
from IPython.display import Markdown, display
from sklearn.preprocessing import StandardScaler

from edvart.data_types import is_numeric
from edvart.data_types import is_boolean, is_categorical, is_numeric
from edvart.plots import scatter_plot_2d
from edvart.report_sections.code_string_formatting import get_code, total_dedent
from edvart.report_sections.section_base import ReportSection, Section, Verbosity
from edvart.utils import discrete_colorscale, is_categorical
from edvart.utils import discrete_colorscale

try:
from edvart.report_sections.umap import UMAP
Expand Down Expand Up @@ -533,11 +533,9 @@ def __init__(
columns = [
col
for col in df.columns
if is_numeric(df[col])
or (
is_categorical(df[col], nunique_max=nunique_max)
and df[col].nunique() <= nunique_max
)
if is_categorical(df[col], unique_value_count_threshold=nunique_max)
or is_boolean(df[col])
or is_numeric(df[col])
]
# If all columns are numeric we don't want to list them all in the generated call
# Setting columns to None will result in the columns argument not being included
Expand Down Expand Up @@ -740,10 +738,8 @@ def __init__(
columns = [
col
for col in df.columns
if (
is_categorical(df[col], nunique_max=nunique_max)
and df[col].nunique() <= nunique_max
)
if is_categorical(df[col], unique_value_count_threshold=nunique_max)
or is_boolean(df[col])
]

# If all columns are numeric we don't want to list them all in the generated call
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import plotly.graph_objects as go
from IPython.display import Markdown, display

from edvart import utils
from edvart.data_types import is_numeric
from edvart.data_types import is_categorical, is_numeric
from edvart.decorators import check_index_time_ascending
from edvart.report_sections.code_string_formatting import get_code, total_dedent
from edvart.report_sections.section_base import Section, Verbosity
Expand Down Expand Up @@ -115,7 +114,7 @@ def _time_series_line_plot_colored(df, columns=None, color_col=None):
)

layout = dict(xaxis_rangeslider_visible=True)
if not utils.is_categorical(df[color_col]):
if not is_categorical(df[color_col]):
raise ValueError(f"Cannot color by non-categorical column `{color_col}`")
if df[color_col].nunique() > 20:
warnings.warn("Coloring by categorical column with many unique values!")
Expand Down
24 changes: 0 additions & 24 deletions edvart/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,6 @@
import seaborn as sns
import statsmodels.api as sm

from edvart.data_types import is_numeric


def is_categorical(series: pd.Series, nunique_max: int = 20) -> bool:
    """
    A heuristic of whether a series is categorical or numerical.

    A series is regarded as categorical when it is not of a datetime64 or
    timedelta64 dtype and at least one of the following holds:

    * `nunique_max` is negative (no limit on the number of unique values),
    * the series has at most `nunique_max` unique values,
    * the series is not numeric according to `is_numeric`.

    Parameters
    ----------
    series: pd.Series
        Input series
    nunique_max: int (default = 20)
        Maximum number of unique values for a numeric series to be regarded as categorical.
        No limit on number of unique values if set to a negative number.

    Returns
    -------
    bool
        True if series contains categorical values, otherwise False
    """
    # Use the public dtype predicates rather than the private
    # `pd.core.dtypes.common.is_datetime_or_timedelta_dtype` helper, which is
    # not part of the supported pandas API.
    if pd.api.types.is_datetime64_dtype(series) or pd.api.types.is_timedelta64_dtype(series):
        return False
    # Cheap checks first: `is_numeric` is only consulted when the series has
    # more unique values than the threshold allows.
    if nunique_max < 0 or series.nunique() <= nunique_max:
        return True
    return not is_numeric(series)


def top_frequent_values(series: pd.Series, n_top: int = 10) -> Dict[Any, float]:
"""
Expand Down
10 changes: 5 additions & 5 deletions tests/test_multivariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def test_code_export_verbosity_medium_all_cols_valid():
expected_code = [
"pca_first_vs_second(df=df)",
"pca_explained_variance(df=df)",
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['col2'])",
]

assert len(exported_code) == len(expected_code)
Expand Down Expand Up @@ -228,14 +228,14 @@ def test_generated_code_verbosity_1():
)"""
),
"parallel_coordinates(df=df)",
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['B'])",
]
else:
expected_code = [
"pca_first_vs_second(df=df, columns=['A', 'C', 'D'])",
"pca_explained_variance(df=df, columns=['A', 'C', 'D'])",
"parallel_coordinates(df=df)",
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['B'])",
]

assert len(exported_code) == len(expected_code)
Expand Down Expand Up @@ -275,7 +275,7 @@ def test_generated_code_verbosity_2():
(
get_code(utils.discrete_colorscale),
get_code(multivariate_analysis.ParallelCategories.parallel_categories),
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['B'])",
)
),
]
Expand Down Expand Up @@ -352,7 +352,7 @@ def test_verbosity_low_different_subsection_verbosities():
expected_subsections_str = ", ".join(expected_subsections)
expected_code = [
"multivariate_analysis(df=df, " f"subsections=[{expected_subsections_str}])",
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['B'])",
"\n\n".join(
(
get_code(utils.discrete_colorscale),
Expand Down
1 change: 0 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def test_full_na_series():
warnings.simplefilter(action="error", category=RuntimeWarning)
result = func(series)
assert math.isnan(float(result))
assert utils.is_categorical(series)
assert utils.num_unique_values(series) == 0


Expand Down

0 comments on commit 39df303

Please sign in to comment.