feat: Add UNIQUE data type (#70)

Resolves #33
datamole-ai · Aug 11, 2023 · c5956ca · c5956ca
1 parent e637aa3
commit c5956ca
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 20 deletions.
diff --git a/edvart/data_types.py b/edvart/data_types.py
@@ -16,6 +16,7 @@ class DataType(IntEnum):
     DATE = 4
     UNKNOWN = 5
     MISSING = 6
+    UNIQUE = 7
 
     def __str__(self):
         return self.name.lower()
@@ -43,6 +44,8 @@ def infer_data_type(series: pd.Series, string_representation: bool = False) -> U
         ret = DataType.BOOLEAN
     elif is_date(series):
         ret = DataType.DATE
+    elif is_unique(series):
+        ret = DataType.UNIQUE
     elif is_categorical(series):
         ret = DataType.CATEGORICAL
     elif is_numeric(series):
@@ -53,6 +56,22 @@ def infer_data_type(series: pd.Series, string_representation: bool = False) -> U
     return str(ret) if string_representation else ret
 
 
+def is_unique(series: pd.Series) -> bool:
+    """Heuristic to tell whether a categorical series has no repeated values.
+
+    Parameters
+    ----------
+    series : pd.Series
+        Series whose values are inspected.
+
+    Returns
+    -------
+    bool
+        Truthy only when `is_categorical(series)` holds and the number of distinct
+        values reported by `series.nunique()` equals `len(series)`.
+    """
+    looks_categorical = is_categorical(series)
+    # `and` short-circuits: values are only counted once the series looks categorical.
+    return looks_categorical and len(series) == series.nunique()
+
+
 def is_numeric(series: pd.Series) -> bool:
     """
     Heuristic to tell if a series contains numbers only.

diff --git a/edvart/report_sections/univariate_analysis.py b/edvart/report_sections/univariate_analysis.py
@@ -272,6 +272,8 @@ def univariate_analysis(df: pd.DataFrame, columns: Optional[List[str]] = None) -
             if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
                 UnivariateAnalysis.top_most_frequent(df[col])
                 UnivariateAnalysis.bar_plot(df[col])
+            elif data_type == DataType.UNIQUE:
+                display(Markdown("Each value in the column is unique."))
             else:
                 UnivariateAnalysis.numeric_statistics(df[col])
                 UnivariateAnalysis.histogram(df[col])
@@ -378,29 +380,37 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
                 column_header = nbfv4.new_markdown_cell(f"## *{col} - {data_type_name}*")
                 cells.append(column_header)
                 if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
-                    code = code_dedent(
-                        f"""
-                        top_most_frequent(df['{col}'])
-                        bar_plot(df['{col}'])"""
-                    )
-                elif self.verbosity == Verbosity.MEDIUM:
-                    code = code_dedent(
-                        f"""
-                            numeric_statistics(df['{col}'])
-                            histogram(df['{col}'])"""
+                    cell = nbfv4.new_code_cell(
+                        code_dedent(
+                            f"""
+                            top_most_frequent(df['{col}'])
+                            bar_plot(df['{col}'])"""
+                        )
                     )
+                elif data_type == DataType.UNIQUE:
+                    cell = nbfv4.new_markdown_cell("Each value in the column is unique.")
                 else:
-                    code = code_dedent(
-                        f"""
-                            numeric_statistics(
-                                df['{col}'],
-                                descriptive_stats=default_descriptive_statistics(),
-                                quantile_stats=default_quantile_statistics()
+                    if self.verbosity == Verbosity.MEDIUM:
+                        cell = nbfv4.new_code_cell(
+                            code_dedent(
+                                f"""
+                                numeric_statistics(df['{col}'])
+                                histogram(df['{col}'])"""
                             )
-                            histogram(df['{col}'])"""
-                    )
-                code_cell = nbfv4.new_code_cell(code)
-                cells.append(code_cell)
+                        )
+                    else:
+                        cell = nbfv4.new_code_cell(
+                            code_dedent(
+                                f"""
+                                numeric_statistics(
+                                    df['{col}'],
+                                    descriptive_stats=default_descriptive_statistics(),
+                                    quantile_stats=default_quantile_statistics()
+                                )
+                                histogram(df['{col}'])"""
+                            )
+                        )
+                cells.append(cell)
 
     def show(self, df: pd.DataFrame) -> None:
         """Generates univariate analysis cell output in the calling notebook.
@@ -425,6 +435,8 @@ def show(self, df: pd.DataFrame) -> None:
             if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
                 UnivariateAnalysis.top_most_frequent(df[col])
                 UnivariateAnalysis.bar_plot(df[col])
+            elif data_type == DataType.UNIQUE:
+                display(Markdown("Each value in the column is unique."))
             else:
                 UnivariateAnalysis.numeric_statistics(df[col])
                 UnivariateAnalysis.histogram(df[col])

diff --git a/tests/test_data_type_inference.py b/tests/test_data_type_inference.py
@@ -24,6 +24,18 @@ def test_inference():
     assert data_types.infer_data_type(
         pd.Series([None, None, np.nan, float("nan")]) == data_types.DataType.MISSING
     ), "Should be missing"
+    assert (
+        data_types.infer_data_type(pd.Series(list(range(10)))) == data_types.DataType.UNIQUE
+    ), "Should be unique"
+    assert (
+        data_types.infer_data_type(pd.Series([1] + list(range(100)))) == data_types.DataType.NUMERIC
+    ), "Should be numeric"
+    assert (
+        data_types.infer_data_type(pd.Series()) == data_types.DataType.UNKNOWN
+    ), "Should be unknown"
+    # Comparison sits outside the call so the inferred type itself is checked.
+    assert (
+        data_types.infer_data_type(pd.Series([True, False])) == data_types.DataType.BOOLEAN
+    ), "Should be boolean"
 
 
 def test_missing_series():