From 500ee8c6c7f34ef88ba1953b33181f1a19f50c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Bel=C3=A1k?= Date: Tue, 10 Oct 2023 15:46:32 +0200 Subject: [PATCH] feat: add support for pandas 2.0 (#162) Resolves #6 --- edvart/data_types.py | 19 +++++---- edvart/utils.py | 29 +++++++++++--- pyproject.toml | 6 ++- tests/pyarrow_utils.py | 8 ++++ tests/test_bivariate_analysis.py | 11 +++-- tests/test_data_type_inference.py | 8 ++-- tests/test_group_analysis.py | 49 ++++++++++++++--------- tests/test_multivariate_analysis.py | 43 +++++++++++++------- tests/test_univariate_analysis_section.py | 30 ++++++++++---- tests/test_utils.py | 7 +++- 10 files changed, 145 insertions(+), 65 deletions(-) create mode 100644 tests/pyarrow_utils.py diff --git a/edvart/data_types.py b/edvart/data_types.py index a614377..cf51d65 100644 --- a/edvart/data_types.py +++ b/edvart/data_types.py @@ -3,6 +3,13 @@ import numpy as np import pandas as pd +try: + import pyarrow # pylint: disable=unused-import +except ImportError: + PYARROW_PANDAS_BACKEND_AVAILABLE = False +else: + PYARROW_PANDAS_BACKEND_AVAILABLE = pd.__version__ >= "2.0" + class DataType(IntEnum): """Class describe possible data types.""" @@ -83,13 +90,7 @@ def is_numeric(series: pd.Series) -> bool: """ if is_missing(series): return False - # When an unknown dtype is encountered, `np.issubdtype(series.dtype, np.number)` - # raises a TypeError. This happens for example if `series` is `pd.Categorical` - # If the dtype is unknown, we treat it as non-numeric, therefore return False. 
- try: - return np.issubdtype(series.dtype, np.number) - except TypeError: - return False + return pd.api.types.is_numeric_dtype(series) def is_missing(series: pd.Series) -> bool: @@ -177,9 +178,7 @@ def is_date(series: pd.Series) -> bool: if contains_numerics: return False try: - converted_series = pd.to_datetime( - series.dropna(), errors="coerce", infer_datetime_format=True - ) + converted_series = pd.to_datetime(series.dropna(), errors="coerce") except ValueError: return False return converted_series.notna().all() diff --git a/edvart/utils.py b/edvart/utils.py index b273b4e..588aa73 100755 --- a/edvart/utils.py +++ b/edvart/utils.py @@ -4,6 +4,7 @@ import pandas as pd import statsmodels.api as sm +from scipy import stats from edvart.data_types import is_numeric @@ -74,9 +75,7 @@ def reindex_to_datetime( Reindexed df. """ df = df.copy() - new_index = pd.to_datetime( - df[datetime_column], unit=unit, origin=origin, infer_datetime_format=True - ) + new_index = pd.to_datetime(df[datetime_column], unit=unit, origin=origin) if keep_index is not None: df[keep_index] = df.index df = df.drop(datetime_column, axis="columns") @@ -211,6 +210,8 @@ def median_absolute_deviation(series: pd.Series) -> float: ------- float """ + if series.isnull().all(): + return float("nan") return median((series - series.mean()).abs()) @@ -243,6 +244,8 @@ def minimum(series: pd.Series) -> float: ------- float """ + if series.isnull().all(): + return float("nan") return series.min() @@ -259,6 +262,8 @@ def maximum(series: pd.Series) -> float: ------- float """ + if series.isnull().all(): + return float("nan") return series.max() @@ -275,6 +280,8 @@ def quartile1(series: pd.Series) -> float: ------- float """ + if series.isnull().all(): + return float("nan") return series.quantile(0.25) @@ -291,6 +298,8 @@ def quartile3(series: pd.Series) -> float: ------- float """ + if series.isnull().all(): + return float("nan") return series.quantile(0.75) @@ -307,6 +316,8 @@ def mean(series: pd.Series) -> 
float: ------- float """ + if series.isnull().all(): + return float("nan") return series.mean() @@ -393,6 +404,8 @@ def std(series: pd.Series) -> float: ------- float """ + if series.isnull().all(): + return float("nan") return series.std() @@ -409,6 +422,8 @@ def mad(series: pd.Series) -> Any: ------- float """ + if series.isnull().all(): + return float("nan") return (series - series.mean()).abs().mean() @@ -425,7 +440,9 @@ def kurtosis(series: pd.Series) -> Any: ------- float """ - return series.kurtosis() + if series.isnull().all(): + return float("nan") + return stats.kurtosis(series) def skewness(series: pd.Series) -> Any: @@ -441,7 +458,9 @@ def skewness(series: pd.Series) -> Any: ------- float """ - return series.skew() + if series.isnull().all(): + return float("nan") + return stats.skew(series) def sum_(series: pd.Series) -> float: diff --git a/pyproject.toml b/pyproject.toml index e1ebb24..132a2f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ exclude = ["tests"] [tool.poetry.dependencies] python = ">=3.8, <3.12" ipykernel = "*" -pandas = "^1.5" +pandas = ">=1.5, <2.1" numpy = "*" matplotlib = "*" seaborn = "^0.12" @@ -28,10 +28,12 @@ umap-learn = { version = "^0.5.4", optional = true} # which also installs an older version of llmvlite, which is incompatible # with newer version of LLVM binaries. 
numba = { version = "^0.57", optional = true } +pyarrow = { version = "^13.0.0", optional = true } [tool.poetry.extras] umap = ["umap-learn", "numba"] -all = ["umap-learn", "numba"] +arrow = ["pyarrow"] +all = ["umap-learn", "numba", "pyarrow"] [tool.poetry.dev-dependencies] pytest-cov = "^2.8" diff --git a/tests/pyarrow_utils.py b/tests/pyarrow_utils.py new file mode 100644 index 0000000..6bbc8fe --- /dev/null +++ b/tests/pyarrow_utils.py @@ -0,0 +1,8 @@ +import pytest + +from edvart.data_types import PYARROW_PANDAS_BACKEND_AVAILABLE + +if PYARROW_PANDAS_BACKEND_AVAILABLE: + pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", [False, True]) +else: + pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", [False]) diff --git a/tests/test_bivariate_analysis.py b/tests/test_bivariate_analysis.py index d1ad1eb..1aaccef 100644 --- a/tests/test_bivariate_analysis.py +++ b/tests/test_bivariate_analysis.py @@ -9,9 +9,13 @@ from edvart.report_sections.code_string_formatting import get_code from edvart.report_sections.section_base import Verbosity +from .pyarrow_utils import pyarrow_parameterize -def get_test_df() -> pd.DataFrame: + +def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame: test_df = pd.DataFrame(data=[[1.1, "a"], [2.2, "b"], [3.3, "c"]], columns=["A", "B"]) + if pyarrow_dtypes: + test_df = test_df.convert_dtypes(dtype_backend="pyarrow") return test_df @@ -407,9 +411,10 @@ def test_imports_verbosity_low_different_subsection_verbosities(): assert set(exported_imports) == set(expected_imports) -def test_show(): +@pyarrow_parameterize +def test_show(pyarrow_dtypes: bool): bivariate_section = BivariateAnalysis() with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) with redirect_stdout(None): - bivariate_section.show(get_test_df()) + bivariate_section.show(get_test_df(pyarrow_dtypes=pyarrow_dtypes)) diff --git a/tests/test_data_type_inference.py b/tests/test_data_type_inference.py index 4411c5a..aedc9f2 100644 --- 
a/tests/test_data_type_inference.py +++ b/tests/test_data_type_inference.py @@ -10,7 +10,9 @@ def test_inference(): == data_types.DataType.NUMERIC ), "Should be numeric type" assert ( - data_types.infer_data_type(pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02"])) + data_types.infer_data_type( + pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]) + ) == data_types.DataType.DATE ), "Should be date type" assert ( @@ -95,13 +97,13 @@ def test_boolean_series(): def test_date_series(): assert data_types.is_date( - pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02"]) + pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]) ), "Should be type date" assert data_types.is_date( pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]) ), "Should be type date" assert not data_types.is_date( - pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", "nan"]) + pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]) ), "Should not be type date" assert not data_types.is_date( pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]) diff --git a/tests/test_group_analysis.py b/tests/test_group_analysis.py index b0710e3..257dace 100644 --- a/tests/test_group_analysis.py +++ b/tests/test_group_analysis.py @@ -21,18 +21,23 @@ ) from edvart.report_sections.section_base import Verbosity +from .pyarrow_utils import pyarrow_parameterize + # Workaround to prevent multiple browser tabs opening with figures plotly.io.renderers.default = "json" -def get_test_df(): - return pd.DataFrame( +def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame: + test_df = pd.DataFrame( data=[ ["P" if np.random.uniform() < 0.4 else "N", 1.5 * i, "X" if i % 2 == 0 else "Y"] for i in range(60) ], columns=["A", "B", "C"], ) + if pyarrow_dtypes: + test_df = test_df.convert_dtypes(dtype_backend="pyarrow") + return test_df def test_default_config_verbosity(): @@ -47,15 +52,19 @@ def 
test_invalid_verbosities(): GroupAnalysis(groupby=[], verbosity=-1) -def test_groupby_nonexistent_col(): +@pyarrow_parameterize +def test_groupby_nonexistent_col(pyarrow_dtypes: bool): with pytest.raises(ValueError): - show_group_analysis(df=get_test_df(), groupby=["non-existent"]) + show_group_analysis(df=get_test_df(pyarrow_dtypes=pyarrow_dtypes), groupby=["non-existent"]) with pytest.raises(ValueError): - group_missing_values(df=get_test_df(), groupby=["non-existent"]) + group_missing_values( + df=get_test_df(pyarrow_dtypes=pyarrow_dtypes), groupby=["non-existent"] + ) -def test_static_methods(): - df = get_test_df() +@pyarrow_parameterize +def test_static_methods(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) with redirect_stdout(None): show_group_analysis(df=df, groupby="C") show_group_analysis(df=df, groupby=["C"], columns=["A"]) @@ -80,8 +89,9 @@ def test_static_methods(): overlaid_histograms(df, groupby=["B"], column="B") -def test_code_export_verbosity_low(): - df = get_test_df() +@pyarrow_parameterize +def test_code_export_verbosity_low(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) group_section = GroupAnalysis(groupby="B", verbosity=Verbosity.LOW) # Export code @@ -96,8 +106,9 @@ def test_code_export_verbosity_low(): assert exported_code[0] == expected_code[0], "Exported code mismatch" -def test_code_export_verbosity_medium(): - df = get_test_df() +@pyarrow_parameterize +def test_code_export_verbosity_medium(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) group_section = GroupAnalysis(groupby="A", verbosity=Verbosity.MEDIUM) # Export code @@ -122,8 +133,9 @@ def test_code_export_verbosity_medium(): assert expected_line == exported_line, "Exported code mismatch" -def test_code_export_verbosity_high(): - df = get_test_df() +@pyarrow_parameterize +def test_code_export_verbosity_high(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) group_section = 
GroupAnalysis(groupby="A", verbosity=Verbosity.HIGH) # Export code @@ -176,8 +188,9 @@ def test_code_export_verbosity_high(): assert expected_line == exported_line, "Exported code mismatch" -def test_columns_parameter(): - df = get_test_df() +@pyarrow_parameterize +def test_columns_parameter(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) ga = GroupAnalysis(groupby="A", columns=["B"]) assert ga.groupby == ["A"] assert ga.columns == ["B"] @@ -192,14 +205,14 @@ def test_columns_parameter(): def test_column_list_not_modified(): - df = get_test_df() columns = ["C"] GroupAnalysis(groupby=["A"], columns=columns) assert columns == ["C"], "Column list modified" -def test_show(): - df = get_test_df() +@pyarrow_parameterize +def test_show(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) group_section = GroupAnalysis(groupby="A") with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) diff --git a/tests/test_multivariate_analysis.py b/tests/test_multivariate_analysis.py index db05ca2..459f65a 100644 --- a/tests/test_multivariate_analysis.py +++ b/tests/test_multivariate_analysis.py @@ -19,8 +19,10 @@ from edvart.report_sections.section_base import Verbosity from edvart.utils import select_numeric_columns +from .pyarrow_utils import pyarrow_parameterize -def get_test_df() -> pd.DataFrame: + +def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame: test_df = pd.DataFrame( data=[ [1.1, "a", 3.7, 3.9], @@ -31,6 +33,8 @@ def get_test_df() -> pd.DataFrame: ], columns=["A", "B", "C", "D"], ) + if pyarrow_dtypes: + test_df = test_df.convert_dtypes(dtype_backend="pyarrow") return test_df @@ -90,7 +94,6 @@ def test_verbosity_propagation(): def test_negative_verbosities(): - test_df = get_test_df() with pytest.raises(ValueError): MultivariateAnalysis(verbosity=-2) with pytest.raises(ValueError): @@ -129,8 +132,9 @@ def test_section_adding(): ), "Subsection should be UMAP" -def test_code_export_verbosity_low(): - 
df = get_test_df() +@pyarrow_parameterize +def test_code_export_verbosity_low(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) multivariate_section = MultivariateAnalysis(verbosity=Verbosity.LOW) # Export code exported_cells = [] @@ -144,7 +148,8 @@ def test_code_export_verbosity_low(): assert exported_code[0] == expected_code[0], "Exported code mismatch" -def test_code_export_verbosity_low_with_subsections(): +@pyarrow_parameterize +def test_code_export_verbosity_low_with_subsections(pyarrow_dtypes: bool): subsec = MultivariateAnalysisSubsection subsections = [subsec.ParallelCategories, subsec.PCA, subsec.ParallelCoordinates, subsec.PCA] if UMAP_AVAILABLE: @@ -182,7 +187,8 @@ def test_code_export_verbosity_low_with_subsections(): assert exported_code[0] == expected_code[0], "Exported code mismatch" -def test_code_export_verbosity_medium_all_cols_valid(): +@pyarrow_parameterize +def test_code_export_verbosity_medium_all_cols_valid(pyarrow_dtypes: bool): all_numeric_df = pd.DataFrame( data=[[1.1, 1, -2], [2.2, 2, -5.3], [3.3, 3, 4]], columns=["col1", "col2", "col3"] ) @@ -210,9 +216,10 @@ def test_code_export_verbosity_medium_all_cols_valid(): assert expected_line == exported_line, "Exported code mismatch" -def test_generated_code_verbosity_1(): +@pyarrow_parameterize +def test_generated_code_verbosity_1(pyarrow_dtypes: bool): multivariate_section = MultivariateAnalysis(verbosity=Verbosity.MEDIUM) - df = get_test_df() + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) exported_cells = [] multivariate_section.add_cells(exported_cells, df=df) @@ -246,8 +253,9 @@ def test_generated_code_verbosity_1(): assert expected_line == exported_line, "Exported code mismatch" -def test_generated_code_verbosity_2(): - df = get_test_df() +@pyarrow_parameterize +def test_generated_code_verbosity_2(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) multivariate_section = MultivariateAnalysis(verbosity=Verbosity.HIGH) multivariate_cells = [] 
@@ -307,10 +315,13 @@ def test_generated_code_verbosity_2(): assert expected_line == exported_line, "Exported code mismatch" -def test_verbosity_medium_non_categorical_col(): +@pyarrow_parameterize +def test_verbosity_medium_non_categorical_col(pyarrow_dtypes: bool): random_array = np.random.randint(low=1, high=40, size=(100, 3)) random_df = pd.DataFrame(data=random_array, columns=["integral", "floating", "cat"]) random_df = random_df.astype({"integral": int, "floating": float, "cat": "category"}) + if pyarrow_dtypes: + random_df = random_df.convert_dtypes(dtype_backend="pyarrow") subsec = MultivariateAnalysisSubsection multivariate_section = multivariate_analysis.MultivariateAnalysis( subsections=[subsec.ParallelCategories], verbosity=Verbosity.MEDIUM @@ -327,7 +338,8 @@ def test_verbosity_medium_non_categorical_col(): assert expected_line == exported_line, "Exported code mismatch" -def test_verbosity_low_different_subsection_verbosities(): +@pyarrow_parameterize +def test_verbosity_low_different_subsection_verbosities(pyarrow_dtypes: bool): subsections = [ MultivariateAnalysisSubsection.PCA, MultivariateAnalysisSubsection.PCA, @@ -336,7 +348,7 @@ def test_verbosity_low_different_subsection_verbosities(): ] if UMAP_AVAILABLE: subsections.insert(2, MultivariateAnalysisSubsection.UMAP) - df = get_test_df() + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) multivariate_section = MultivariateAnalysis( verbosity=Verbosity.LOW, subsections=subsections, @@ -443,8 +455,9 @@ def test_imports_verbosity_low_different_subsection_verbosities(): assert set(exported_imports) == set(expected_imports) -def test_show(): - df = get_test_df() +@pyarrow_parameterize +def test_show(pyarrow_dtypes: bool): + df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) multivariate_section = MultivariateAnalysis() with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) diff --git a/tests/test_univariate_analysis_section.py b/tests/test_univariate_analysis_section.py index 
3dd0b0f..76e0ce8 100644 --- a/tests/test_univariate_analysis_section.py +++ b/tests/test_univariate_analysis_section.py @@ -9,6 +9,16 @@ from edvart.report_sections.code_string_formatting import code_dedent, get_code from edvart.report_sections.section_base import Verbosity +from .pyarrow_utils import pyarrow_parameterize + + +def get_test_df(pyarrow_dtypes: bool) -> pd.DataFrame: + test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"]) + if pyarrow_dtypes: + test_df = test_df.convert_dtypes(dtype_backend="pyarrow") + + return test_df + def test_invalid_verbosity(): with pytest.raises(ValueError): @@ -21,8 +31,9 @@ def test_invalid_verbosity(): univariate_analysis.UnivariateAnalysis(verbosity="1") -def test_code_export_verbosity_low(): - test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"]) +@pyarrow_parameterize +def test_code_export_verbosity_low(pyarrow_dtypes: bool): + test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) # Construct univariate analysis section univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.LOW) # Export code @@ -36,8 +47,9 @@ def test_code_export_verbosity_low(): assert exported_code[0] == expected_code[0], "Exported code mismatch" -def test_code_export_verbosity_medium(): - test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"]) +@pyarrow_parameterize +def test_code_export_verbosity_medium(pyarrow_dtypes: bool): + test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) # Construct univariate analysis section univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.MEDIUM) # Export code @@ -55,8 +67,9 @@ def test_code_export_verbosity_medium(): assert exported_code[i] == expected_code[i], "Exported code mismatch" -def test_code_export_verbosity_high(): - test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"]) +@pyarrow_parameterize +def 
test_code_export_verbosity_high(pyarrow_dtypes: bool): + test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) # Construct univariate analysis section univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.HIGH) # Export code @@ -105,8 +118,9 @@ def test_code_export_verbosity_high(): assert exported_code[i] == expected_code[i], "Exported code mismatch" -def test_show(): - test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"]) +@pyarrow_parameterize +def test_show(pyarrow_dtypes: bool): + test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) univariate_section = univariate_analysis.UnivariateAnalysis() with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) diff --git a/tests/test_utils.py b/tests/test_utils.py index 7eb4856..aad6be5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,9 +7,14 @@ from edvart import utils +from .pyarrow_utils import pyarrow_parameterize -def test_full_na_series(): + +@pyarrow_parameterize +def test_full_na_series(pyarrow_dtypes: bool): series = pd.Series([None, np.nan, None]) + if pyarrow_dtypes: + series = series.convert_dtypes(dtype_backend="pyarrow") for func in ( utils.quartile1, utils.median,