feat: Use UNIQUE data type in univariate analysis

datamole-ai · Aug 10, 2023 · 136ebb6
1 parent e6f1b8e
commit 136ebb6
Showing 1 changed file with 32 additions and 20 deletions.
diff --git a/edvart/report_sections/univariate_analysis.py b/edvart/report_sections/univariate_analysis.py
@@ -278,6 +278,8 @@ def univariate_analysis(df: pd.DataFrame, columns: Optional[List[str]] = None) -
             if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
                 UnivariateAnalysis.top_most_frequent(df[col])
                 UnivariateAnalysis.bar_plot(df[col])
+            elif data_type == DataType.UNIQUE:
+                display(Markdown("Each value in the column is unique."))
             else:
                 UnivariateAnalysis.numeric_statistics(df[col])
                 UnivariateAnalysis.histogram(df[col])
@@ -384,29 +386,37 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
                 column_header = nbfv4.new_markdown_cell(f"## *{col} - {data_type_name}*")
                 cells.append(column_header)
                 if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
-                    code = code_dedent(
-                        f"""
-                        top_most_frequent(df['{col}'])
-                        bar_plot(df['{col}'])"""
-                    )
-                elif self.verbosity == 1:
-                    code = code_dedent(
-                        f"""
-                            numeric_statistics(df['{col}'])
-                            histogram(df['{col}'])"""
+                    cell = nbfv4.new_code_cell(
+                        code_dedent(
+                            f"""
+                            top_most_frequent(df['{col}'])
+                            bar_plot(df['{col}'])"""
+                        )
                     )
+                elif data_type == DataType.UNIQUE:
+                    cell = nbfv4.new_markdown_cell("Each value in the column is unique.")
                 else:
-                    code = code_dedent(
-                        f"""
-                            numeric_statistics(
-                                df['{col}'],
-                                descriptive_stats=default_descriptive_statistics(),
-                                quantile_stats=default_quantile_statistics()
+                    if self.verbosity == 1:
+                        cell = nbfv4.new_code_cell(
+                            code_dedent(
+                                f"""
+                                numeric_statistics(df['{col}'])
+                                histogram(df['{col}'])"""
                             )
-                            histogram(df['{col}'])"""
-                    )
-                code_cell = nbfv4.new_code_cell(code)
-                cells.append(code_cell)
+                        )
+                    else:
+                        cell = nbfv4.new_code_cell(
+                            code_dedent(
+                                f"""
+                                numeric_statistics(
+                                    df['{col}'],
+                                    descriptive_stats=default_descriptive_statistics(),
+                                    quantile_stats=default_quantile_statistics()
+                                )
+                                histogram(df['{col}'])"""
+                            )
+                        )
+                cells.append(cell)
 
     def show(self, df: pd.DataFrame) -> None:
         """Generates univariate analysis cell output in the calling notebook.
@@ -431,6 +441,8 @@ def show(self, df: pd.DataFrame) -> None:
             if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
                 UnivariateAnalysis.top_most_frequent(df[col])
                 UnivariateAnalysis.bar_plot(df[col])
+            elif data_type == DataType.UNIQUE:
+                display(Markdown("Each value in the column is unique."))
             else:
                 UnivariateAnalysis.numeric_statistics(df[col])
                 UnivariateAnalysis.histogram(df[col])