Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: remove parameter string_representation from infer_data_type #115

Merged
merged 2 commits on Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 18 additions & 21 deletions edvart/data_types.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Module defines data types and helper function for recognizing them."""

from enum import IntEnum
from typing import Union

import numpy as np
import pandas as pd
Expand All @@ -22,38 +21,36 @@ def __str__(self):
return self.name.lower()


def infer_data_type(series: pd.Series, string_representation: bool = False) -> Union[DataType, str]:
# pylint: disable=too-many-return-statements
def infer_data_type(series: pd.Series) -> DataType:
"""Infers the data type of the series passed in.

Parameters
----------
series : pd.Series
Series from which to infer data type.
string_representation : bool
Whether to return the resulting data type as DataType enum value or string.

Returns
-------
DataType : Union[DataType, str]
Inferred custom edvart data type or its string representation.
DataType
Inferred custom edvart data type.
"""
ret = None
if series.empty:
return DataType.UNKNOWN
if is_missing(series):
ret = DataType.MISSING
return DataType.MISSING
if is_boolean(series):
ret = DataType.BOOLEAN
elif is_date(series):
ret = DataType.DATE
elif is_unique(series):
ret = DataType.UNIQUE
elif is_categorical(series):
ret = DataType.CATEGORICAL
elif is_numeric(series):
ret = DataType.NUMERIC
else:
ret = DataType.UNKNOWN

return str(ret) if string_representation else ret
return DataType.BOOLEAN
if is_date(series):
return DataType.DATE
if is_unique(series):
return DataType.UNIQUE
if is_categorical(series):
return DataType.CATEGORICAL
if is_numeric(series):
return DataType.NUMERIC

return DataType.UNKNOWN


def is_unique(series: pd.Series) -> bool:
Expand Down
3 changes: 1 addition & 2 deletions edvart/report_sections/dataset_overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,10 +378,9 @@ def data_types(df: pd.DataFrame, columns: Optional[List[str]] = None) -> None:
if columns is not None:
df = df[columns]
dtypes = df.apply(
func=infer_data_type,
func=lambda x_: str(infer_data_type(x_)),
axis=0,
result_type="expand",
string_representation=True,
)

# Convert result to frame for viewing
Expand Down
6 changes: 3 additions & 3 deletions edvart/report_sections/univariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,8 @@ def univariate_analysis(df: pd.DataFrame, columns: Optional[List[str]] = None) -
display(Markdown(f"## *{col} - NULL*"))
display(Markdown("The column contains only null values."))
continue
data_type_name = infer_data_type(df[col], string_representation=True)
data_type = infer_data_type(df[col])
data_type_name = str(data_type)
display(Markdown(f"## *{col} - {data_type_name}*"))
if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
UnivariateAnalysis.top_most_frequent(df[col])
Expand Down Expand Up @@ -375,8 +375,8 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
display(Markdown(f"## *{col} - NULL*"))
display(Markdown("The column contains only null values."))
continue
data_type_name = infer_data_type(self.df[col], string_representation=True)
data_type = infer_data_type(self.df[col])
data_type_name = str(data_type)
column_header = nbfv4.new_markdown_cell(f"## *{col} - {data_type_name}*")
cells.append(column_header)
if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
Expand Down Expand Up @@ -429,8 +429,8 @@ def show(self, df: pd.DataFrame) -> None:
display(Markdown(f"## *{col} - NULL*"))
display(Markdown("The column contains only null values."))
continue
data_type_name = infer_data_type(df[col], string_representation=True)
data_type = infer_data_type(df[col])
data_type_name = str(data_type)
display(Markdown(f"## *{col} - {data_type_name}*"))
if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
UnivariateAnalysis.top_most_frequent(df[col])
Expand Down