datamole-ai · lukany · Sep 4, 2023 · Aug 31, 2023 · Aug 31, 2023
@@ -1,7 +1,6 @@
 """Module defines data types and helper function for recognizing them."""
 
 from enum import IntEnum
-from typing import Union
 
 import numpy as np
 import pandas as pd
@@ -22,38 +21,36 @@ def __str__(self):
         return self.name.lower()
 
 
-def infer_data_type(series: pd.Series, string_representation: bool = False) -> Union[DataType, str]:
+# pylint: disable=too-many-return-statements
+def infer_data_type(series: pd.Series) -> DataType:
     """Infers the data type of the series passed in.
 
     Parameters
     ----------
     series : pd.Series
         Series from which to infer data type.
-    string_representation : bool
-        Whether to return the resulting data type as DataType enum value or string.
 
     Returns
     -------
-    DataType : Union[DataType, str]
-        Inferred custom edvart data type or its string representation.
+    DataType
+        Inferred custom edvart data type; UNKNOWN if the series is empty or no check matches.
     """
-    ret = None
+    if series.empty:
+        return DataType.UNKNOWN
     if is_missing(series):
-        ret = DataType.MISSING
+        return DataType.MISSING
     if is_boolean(series):
-        ret = DataType.BOOLEAN
-    elif is_date(series):
-        ret = DataType.DATE
-    elif is_unique(series):
-        ret = DataType.UNIQUE
-    elif is_categorical(series):
-        ret = DataType.CATEGORICAL
-    elif is_numeric(series):
-        ret = DataType.NUMERIC
-    else:
-        ret = DataType.UNKNOWN
-
-    return str(ret) if string_representation else ret
+        return DataType.BOOLEAN
+    if is_date(series):
+        return DataType.DATE
+    if is_unique(series):
+        return DataType.UNIQUE
+    if is_categorical(series):
+        return DataType.CATEGORICAL
+    if is_numeric(series):
+        return DataType.NUMERIC
+
+    return DataType.UNKNOWN
 
 
 def is_unique(series: pd.Series) -> bool:

@@ -378,10 +378,9 @@ def data_types(df: pd.DataFrame, columns: Optional[List[str]] = None) -> None:
         if columns is not None:
             df = df[columns]
         dtypes = df.apply(
-            func=infer_data_type,
+            func=lambda x_: str(infer_data_type(x_)),
             axis=0,
             result_type="expand",
-            string_representation=True,
         )
 
         # Convert result to frame for viewing

@@ -266,8 +266,8 @@ def univariate_analysis(df: pd.DataFrame, columns: Optional[List[str]] = None) -
                 display(Markdown(f"## *{col} - NULL*"))
                 display(Markdown("The column contains only null values."))
                 continue
-            data_type_name = infer_data_type(df[col], string_representation=True)
             data_type = infer_data_type(df[col])
+            data_type_name = str(data_type)
             display(Markdown(f"## *{col} - {data_type_name}*"))
             if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
                 UnivariateAnalysis.top_most_frequent(df[col])
@@ -375,8 +375,8 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
                     display(Markdown(f"## *{col} - NULL*"))
                     display(Markdown("The column contains only null values."))
                     continue
-                data_type_name = infer_data_type(self.df[col], string_representation=True)
                 data_type = infer_data_type(self.df[col])
+                data_type_name = str(data_type)
                 column_header = nbfv4.new_markdown_cell(f"## *{col} - {data_type_name}*")
                 cells.append(column_header)
                 if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
@@ -429,8 +429,8 @@ def show(self, df: pd.DataFrame) -> None:
                 display(Markdown(f"## *{col} - NULL*"))
                 display(Markdown("The column contains only null values."))
                 continue
-            data_type_name = infer_data_type(df[col], string_representation=True)
             data_type = infer_data_type(df[col])
+            data_type_name = str(data_type)
             display(Markdown(f"## *{col} - {data_type_name}*"))
             if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
                 UnivariateAnalysis.top_most_frequent(df[col])