From 131f18c5c25c5b257c68c80d17dc3e8a985ba55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Bel=C3=A1k?= Date: Thu, 31 Aug 2023 14:11:36 +0200 Subject: [PATCH 1/2] refactor: Use direct conversion to represent inferred data type. Replaces two calls to `infer_data_type`, once with `string_representation=True` and once with `string_representation=False`. --- edvart/report_sections/dataset_overview.py | 3 +-- edvart/report_sections/univariate_analysis.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/edvart/report_sections/dataset_overview.py b/edvart/report_sections/dataset_overview.py index ef24427..6529f0b 100644 --- a/edvart/report_sections/dataset_overview.py +++ b/edvart/report_sections/dataset_overview.py @@ -378,10 +378,9 @@ def data_types(df: pd.DataFrame, columns: Optional[List[str]] = None) -> None: if columns is not None: df = df[columns] dtypes = df.apply( - func=infer_data_type, + func=lambda x_: str(infer_data_type(x_)), axis=0, result_type="expand", - string_representation=True, ) # Convert result to frame for viewing diff --git a/edvart/report_sections/univariate_analysis.py b/edvart/report_sections/univariate_analysis.py index d0599d4..dd35c98 100644 --- a/edvart/report_sections/univariate_analysis.py +++ b/edvart/report_sections/univariate_analysis.py @@ -266,8 +266,8 @@ def univariate_analysis(df: pd.DataFrame, columns: Optional[List[str]] = None) - display(Markdown(f"## *{col} - NULL*")) display(Markdown("The column contains only null values.")) continue - data_type_name = infer_data_type(df[col], string_representation=True) data_type = infer_data_type(df[col]) + data_type_name = str(data_type) display(Markdown(f"## *{col} - {data_type_name}*")) if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN): UnivariateAnalysis.top_most_frequent(df[col]) @@ -375,8 +375,8 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None: display(Markdown(f"## *{col} - NULL*")) display(Markdown("The column contains only null 
values.")) continue - data_type_name = infer_data_type(self.df[col], string_representation=True) data_type = infer_data_type(self.df[col]) + data_type_name = str(data_type) column_header = nbfv4.new_markdown_cell(f"## *{col} - {data_type_name}*") cells.append(column_header) if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN): @@ -429,8 +429,8 @@ def show(self, df: pd.DataFrame) -> None: display(Markdown(f"## *{col} - NULL*")) display(Markdown("The column contains only null values.")) continue - data_type_name = infer_data_type(df[col], string_representation=True) data_type = infer_data_type(df[col]) + data_type_name = str(data_type) display(Markdown(f"## *{col} - {data_type_name}*")) if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN): UnivariateAnalysis.top_most_frequent(df[col]) From 02a5fe727b557dc949bced8e3b7855b5f429fa6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Bel=C3=A1k?= Date: Thu, 31 Aug 2023 14:12:54 +0200 Subject: [PATCH 2/2] feat!: Remove parameter `string_representation` from `infer_data_type`. The parameter is no longer used in `edvart`. BREAKING CHANGE: Parameter `string_representation` of `edvart.data_types.infer_data_type` removed. Call `str` on the result instead to get the string representation. 
--- edvart/data_types.py | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/edvart/data_types.py b/edvart/data_types.py index 406e9b8..4647b08 100644 --- a/edvart/data_types.py +++ b/edvart/data_types.py @@ -1,7 +1,6 @@ """Module defines data types and helper function for recognizing them.""" from enum import IntEnum -from typing import Union import numpy as np import pandas as pd @@ -22,38 +21,36 @@ def __str__(self): return self.name.lower() -def infer_data_type(series: pd.Series, string_representation: bool = False) -> Union[DataType, str]: +# pylint: disable=too-many-return-statements +def infer_data_type(series: pd.Series) -> DataType: """Infers the data type of the series passed in. Parameters ---------- series : pd.Series Series from which to infer data type. - string_representation : bool - Whether to return the resulting data type as DataType enum value or string. Returns ------- - DataType : Union[DataType, str] - Inferred custom edvart data type or its string representation. + DataType + Inferred custom edvart data type. """ - ret = None + if series.empty: + return DataType.UNKNOWN if is_missing(series): - ret = DataType.MISSING + return DataType.MISSING if is_boolean(series): - ret = DataType.BOOLEAN - elif is_date(series): - ret = DataType.DATE - elif is_unique(series): - ret = DataType.UNIQUE - elif is_categorical(series): - ret = DataType.CATEGORICAL - elif is_numeric(series): - ret = DataType.NUMERIC - else: - ret = DataType.UNKNOWN - - return str(ret) if string_representation else ret + return DataType.BOOLEAN + if is_date(series): + return DataType.DATE + if is_unique(series): + return DataType.UNIQUE + if is_categorical(series): + return DataType.CATEGORICAL + if is_numeric(series): + return DataType.NUMERIC + + return DataType.UNKNOWN def is_unique(series: pd.Series) -> bool: