Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: Remove utils.is_categorical #108

Merged
merged 2 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions edvart/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
import pandas as pd
import plotly.graph_objs as go

from edvart import utils
from edvart.data_types import is_numeric
from edvart.data_types import is_categorical, is_numeric

# Multiplier which makes plotly interactive plots (size in pixels) and
# matplotlib plots (size in inches) about the same size
Expand Down Expand Up @@ -101,7 +100,7 @@ def _scatter_plot_2d_noninteractive(
) -> None:
_fig, ax = plt.subplots(figsize=figsize)
if color_col is not None:
is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col])
is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col])
if is_color_categorical:
color_categorical = pd.Categorical(df[color_col])
color_codes = color_categorical.codes
Expand Down Expand Up @@ -163,7 +162,7 @@ def _scatter_plot_2d_interactive(
layout.yaxis.scaleratio = 1
fig = go.Figure(layout=layout)
if color_col is not None:
is_color_categorical = utils.is_categorical(df[color_col]) or not is_numeric(df[color_col])
is_color_categorical = is_categorical(df[color_col]) or not is_numeric(df[color_col])
if is_color_categorical:
df = df.copy()
x_name, y_name = "__edvart_scatter_x", "__edvart_scatter_y"
Expand Down
18 changes: 7 additions & 11 deletions edvart/report_sections/multivariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
from IPython.display import Markdown, display
from sklearn.preprocessing import StandardScaler

from edvart.data_types import is_numeric
from edvart.data_types import is_boolean, is_categorical, is_numeric
from edvart.plots import scatter_plot_2d
from edvart.report_sections.code_string_formatting import get_code, total_dedent
from edvart.report_sections.section_base import ReportSection, Section, Verbosity
from edvart.utils import discrete_colorscale, is_categorical
from edvart.utils import discrete_colorscale

try:
from edvart.report_sections.umap import UMAP
Expand Down Expand Up @@ -533,11 +533,9 @@ def __init__(
columns = [
col
for col in df.columns
if is_numeric(df[col])
or (
is_categorical(df[col], nunique_max=nunique_max)
and df[col].nunique() <= nunique_max
)
if is_categorical(df[col], unique_value_count_threshold=nunique_max)
or is_boolean(df[col])
or is_numeric(df[col])
]
# If all columns are numeric we don't want to list them all in the generated call
# Setting columns to None will result in the columns argument not being included
Expand Down Expand Up @@ -740,10 +738,8 @@ def __init__(
columns = [
col
for col in df.columns
if (
is_categorical(df[col], nunique_max=nunique_max)
and df[col].nunique() <= nunique_max
)
if is_categorical(df[col], unique_value_count_threshold=nunique_max)
or is_boolean(df[col])
]

# If all columns are numeric we don't want to list them all in the generated call
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import plotly.graph_objects as go
from IPython.display import Markdown, display

from edvart import utils
from edvart.data_types import is_numeric
from edvart.data_types import is_categorical, is_numeric
from edvart.decorators import check_index_time_ascending
from edvart.report_sections.code_string_formatting import get_code, total_dedent
from edvart.report_sections.section_base import Section, Verbosity
Expand Down Expand Up @@ -115,7 +114,7 @@ def _time_series_line_plot_colored(df, columns=None, color_col=None):
)

layout = dict(xaxis_rangeslider_visible=True)
if not utils.is_categorical(df[color_col]):
if not is_categorical(df[color_col]):
raise ValueError(f"Cannot color by non-categorical column `{color_col}`")
if df[color_col].nunique() > 20:
warnings.warn("Coloring by categorical column with many unique values!")
Expand Down
24 changes: 0 additions & 24 deletions edvart/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,6 @@
import seaborn as sns
import statsmodels.api as sm

from edvart.data_types import is_numeric


def is_categorical(series: pd.Series, nunique_max: int = 20) -> bool:
    """
    A heuristic of whether a series is categorical or numerical.

    Datetime and timedelta series are never regarded as categorical.
    Any other series is categorical if it is non-numeric, or if it is numeric
    and has at most `nunique_max` unique values.

    Parameters
    ----------
    series: pd.Series
        Input series
    nunique_max: int (default = 20)
        Maximum number of unique values for a numeric series to be regarded as categorical.
        No limit on number of unique values if set to a negative number.

    Returns
    -------
    bool
        True if series contains categorical values, otherwise False
    """
    # Use the public `pd.api.types` predicates rather than the private
    # `pd.core.dtypes.common.is_datetime_or_timedelta_dtype`, which is not part
    # of the supported pandas API. `is_datetime64_any_dtype` also recognizes
    # timezone-aware datetimes.
    if pd.api.types.is_datetime64_any_dtype(series) or pd.api.types.is_timedelta64_dtype(series):
        return False
    # A negative threshold disables the unique-value limit entirely.
    if nunique_max < 0 or series.nunique() <= nunique_max:
        return True
    # Too many unique values: only non-numeric series still count as categorical.
    return not is_numeric(series)


def top_frequent_values(series: pd.Series, n_top: int = 10) -> Dict[Any, float]:
"""
Expand Down
10 changes: 5 additions & 5 deletions tests/test_multivariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def test_code_export_verbosity_medium_all_cols_valid():
expected_code = [
"pca_first_vs_second(df=df)",
"pca_explained_variance(df=df)",
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['col2'])",
]

assert len(exported_code) == len(expected_code)
Expand Down Expand Up @@ -228,14 +228,14 @@ def test_generated_code_verobsity_1():
)"""
),
"parallel_coordinates(df=df)",
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['B'])",
]
else:
expected_code = [
"pca_first_vs_second(df=df, columns=['A', 'C', 'D'])",
"pca_explained_variance(df=df, columns=['A', 'C', 'D'])",
"parallel_coordinates(df=df)",
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['B'])",
]

assert len(exported_code) == len(expected_code)
Expand Down Expand Up @@ -275,7 +275,7 @@ def test_generated_code_verobsity_2():
(
get_code(utils.discrete_colorscale),
get_code(multivariate_analysis.ParallelCategories.parallel_categories),
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['B'])",
)
),
]
Expand Down Expand Up @@ -352,7 +352,7 @@ def test_verbosity_low_different_subsection_verbosities():
expected_subsections_str = ", ".join(expected_subsections)
expected_code = [
"multivariate_analysis(df=df, " f"subsections=[{expected_subsections_str}])",
"parallel_categories(df=df)",
"parallel_categories(df=df, columns=['B'])",
"\n\n".join(
(
get_code(utils.discrete_colorscale),
Expand Down
1 change: 0 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def test_full_na_series():
warnings.simplefilter(action="error", category=RuntimeWarning)
result = func(series)
assert math.isnan(float(result))
assert utils.is_categorical(series)
assert utils.num_unique_values(series) == 0


Expand Down