From 79856bca5a0d3d49abe9c37502e9c4e07829234c Mon Sep 17 00:00:00 2001 From: Alex Barros Date: Fri, 3 Mar 2023 10:07:49 -0300 Subject: [PATCH] feat: add type schema (#1274) * feat: allows user to define variable types --- src/ydata_profiling/compare_reports.py | 4 ++ .../model/pandas/summary_pandas.py | 11 +++++- src/ydata_profiling/model/typeset.py | 13 ++++++- src/ydata_profiling/profile_report.py | 5 ++- tests/unit/test_typeset_default.py | 37 ++++++++++++++++++- 5 files changed, 65 insertions(+), 5 deletions(-) diff --git a/src/ydata_profiling/compare_reports.py b/src/ydata_profiling/compare_reports.py index cf4578804..e108d5ea5 100644 --- a/src/ydata_profiling/compare_reports.py +++ b/src/ydata_profiling/compare_reports.py @@ -134,6 +134,10 @@ def _compare_profile_report_preprocess( config.html.style.primary_colors ) + # enforce same types + for report in reports[1:]: + report._typeset = reports[0].typeset + # Obtain description sets descriptions = [report.get_description() for report in reports] for label, description in zip(labels, descriptions): diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index 89fe2f1f3..bbb401fd0 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -12,6 +12,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summarizer import BaseSummarizer from ydata_profiling.model.summary import describe_1d, get_series_descriptions +from ydata_profiling.model.typeset import ProfilingTypeSet from ydata_profiling.utils.dataframe import sort_column_names @@ -37,8 +38,13 @@ def pandas_describe_1d( # Make sure pd.NA is not in the series series = series.fillna(np.nan) - # get `infer_dtypes` (bool) from config - if config.infer_dtypes: + if ( + isinstance(typeset, ProfilingTypeSet) + and typeset.type_schema + and series.name in typeset.type_schema + ): + vtype = typeset.type_schema[series.name] + elif config.infer_dtypes: # Infer variable types vtype = typeset.infer_type(series) series = typeset.cast_to_inferred(series) @@ -47,6 +53,7 @@ def pandas_describe_1d( # [new dtypes, changed using `astype` function are now considered] vtype = typeset.detect_type(series) + typeset.type_schema[series.name] = vtype return summarizer.summarize(config, series, dtype=vtype) diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py index cb1432bb8..489c1d5e5 100644 --- a/src/ydata_profiling/model/typeset.py +++ b/src/ydata_profiling/model/typeset.py @@ -241,7 +241,7 @@ def is_timedependent(series: pd.Series) -> bool: class ProfilingTypeSet(visions.VisionsTypeset): - def __init__(self, config: Settings): + def __init__(self, config: Settings, type_schema: dict = None): self.config = config types = typeset_types(config) @@ -249,3 +249,14 @@ def __init__(self, config: Settings): with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning) super().__init__(types) + + self.type_schema = self._init_type_schema(type_schema or {}) + + def _init_type_schema(self, type_schema: dict) -> dict: + return {k: self._get_type(v) for k, v in type_schema.items()} + + def _get_type(self, type_name: str) -> visions.VisionsBaseType: + for t in self.types: + if t.__name__.lower() == type_name.lower(): + return t + raise ValueError(f"Type [{type_name}] not found.") diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index 1cc872557..8e707155b 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -68,6 +68,7 @@ def __init__( typeset: Optional[VisionsTypeset] = None, summarizer: Optional[BaseSummarizer] = None, config: Optional[Settings] = None, + type_schema: Optional[dict] = None, **kwargs, ): """Generate a ProfileReport based on a pandas or spark.sql DataFrame @@ -89,6 +90,7 @@ def __init__( sample: optional dict(name="Sample title", caption="Caption", data=pd.DataFrame()) typeset: optional user typeset to use for type inference summarizer: optional user summarizer to generate custom summary output + type_schema: optional dict containing pairs of `column name`: `type` **kwargs: other arguments, for valid arguments, check the default configuration file. """ self.__validate_inputs(df, minimal, tsmode, config_file, lazy) @@ -139,6 +141,7 @@ def __init__( self.config = report_config self._df_hash = None self._sample = sample + self._type_schema = type_schema self._typeset = typeset self._summarizer = summarizer @@ -230,7 +233,7 @@ def invalidate_cache(self, subset: Optional[str] = None) -> None: @property def typeset(self) -> Optional[VisionsTypeset]: if self._typeset is None: - self._typeset = ProfilingTypeSet(self.config) + self._typeset = ProfilingTypeSet(self.config, self._type_schema) return self._typeset @property diff --git a/tests/unit/test_typeset_default.py b/tests/unit/test_typeset_default.py index 3580163e8..11b42266b 100644 --- a/tests/unit/test_typeset_default.py +++ b/tests/unit/test_typeset_default.py @@ -1,5 +1,7 @@ import os +import numpy as np +import pandas as pd import pytest from visions.test.series import get_series from visions.test.utils import ( @@ -14,6 +16,7 @@ from tests.unit.test_utils import patch_arg from ydata_profiling.config import Settings from ydata_profiling.model.typeset import ProfilingTypeSet +from ydata_profiling.profile_report import ProfileReport base_path = os.path.abspath(os.path.dirname(__file__)) @@ -161,7 +164,7 @@ ) ) def test_contains(name, series, contains_type, member): - """Test the generated combinations for "series in type" + """Test the generated combinations for "series in type". Args: series: the series to test @@ -349,3 +352,35 @@ def test_conversion(name, source_type, relation_type, series, member): """ result, message = convert(name, source_type, relation_type, series, member) assert result, message + + +@pytest.fixture +def dataframe(size: int = 1000) -> pd.DataFrame: + return pd.DataFrame( + { + "boolean": np.random.choice([True, False], size=size), + "numeric": np.random.rand(size), + "categorical": np.random.choice(np.arange(5), size=size), + "timeseries": np.arange(size), + } + ) + + +def convertion_map() -> list: + types = { + "boolean": ["Categorical", "Unsupported"], + "numeric": ["Categorical", "Boolean", "Unsupported"], + "categorical": ["Numeric", "Boolean", "TimeSeries", "Unsupported"], + "timeseries": ["Numeric", "Boolean", "Categorical", "Unsupported"], + } + return [(k, {k: i}) for k, v in types.items() for i in v] + + +@pytest.mark.parametrize("column,type_schema", convertion_map()) +def test_type_schema(dataframe: pd.DataFrame, column: str, type_schema: dict): + prof = ProfileReport(dataframe[[column]], tsmode=True, type_schema=type_schema) + prof.get_description() + assert isinstance(prof.typeset, ProfilingTypeSet) + assert prof.typeset.type_schema[column] == prof.typeset._get_type( + type_schema[column] + )