feat!: filter and check columns in section main functions

Previously, columns were computed in the `__init__` methods of the respective subsections of multivariate analysis. This required the DataFrame on which the analysis is performed to be specified in the constructor, which prevented creating a section without a DataFrame. This change makes #36 much easier and enables removal of the `add_<section>` methods.
datamole-ai · Sep 20, 2023 · 9d7f09d · 9d7f09d
2 parents a7fafbb + dcec60e
commit 9d7f09d
Show file tree

Hide file tree

Showing 25 changed files with 203 additions and 126 deletions.
diff --git a/README.md b/README.md
@@ -53,7 +53,7 @@ pip install edvart
 See the notebook
 [examples/report-example.ipynb](https://nbviewer.org/github/datamole-ai/edvart/blob/main/examples/report-example.ipynb)
 for an example report on a tabular dataset or
-[examples/time-series-report-time-series-report-example.ipynb](https://nbviewer.org/github/datamole-ai/edvart/blob/main/examples/time-series-report-example.ipynb)
+[examples/time-series-report-example.ipynb](https://nbviewer.org/github/datamole-ai/edvart/blob/main/examples/time-series-report-example.ipynb)
 for an example report on a time-series dataset.
 
 See the [Usage section](https://datamole-ai.github.io/edvart/usage.html) of the documentation

diff --git a/edvart/report.py b/edvart/report.py
@@ -145,7 +145,7 @@ def _generate_notebook(
         if self._table_of_contents is not None:
             self._table_of_contents.add_cells(self.sections, nb["cells"])
         for section in self.sections:
-            section.add_cells(nb["cells"])
+            section.add_cells(cells=nb["cells"], df=self.df)
 
         return nb
 

diff --git a/edvart/report_sections/bivariate_analysis.py b/edvart/report_sections/bivariate_analysis.py
@@ -212,7 +212,7 @@ def bivariate_analysis(
         for sub in bivariate_analysis.subsections:
             sub.show(df)
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds cells to the list of cells.
 
         Cells can be either code cells or markdown cells.
@@ -221,6 +221,8 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
         ----------
         cells : List[Dict[str, Any]]
             List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=1))
         cells.append(section_header)
@@ -251,9 +253,9 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
             cells.append(nbfv4.new_code_cell(code))
             for sub in self.subsections:
                 if sub.verbosity > Verbosity.LOW:
-                    sub.add_cells(cells)
+                    sub.add_cells(cells=cells, df=df)
         else:
-            super().add_cells(cells)
+            super().add_cells(cells=cells, df=df)
 
     def required_imports(self) -> List[str]:
         """Returns a list of imports to be put at the top of a generated notebook.
@@ -518,13 +520,15 @@ def required_imports(self) -> List[str]:
             "from edvart.data_types import is_numeric",
         ]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds cells to the list of cells. Cells can be either code cells or markdown cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)
@@ -697,13 +701,15 @@ def required_imports(self) -> List[str]:
             "import seaborn as sns",
         ]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds cells to the list of cells. Cells can be either code cells or markdown cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)
@@ -958,13 +964,15 @@ def required_imports(self) -> List[str]:
             "import matplotlib.pyplot as plt",
         ]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds cells to the list of cells. Cells can be either code cells or markdown cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)

diff --git a/edvart/report_sections/dataset_overview.py b/edvart/report_sections/dataset_overview.py
@@ -12,7 +12,9 @@
     is_boolean,
     is_categorical,
     is_date,
+    is_missing,
     is_numeric,
+    is_unique,
 )
 from edvart.pandas_formatting import hide_index, render_dictionary, series_to_frame
 from edvart.report_sections.code_string_formatting import get_code, total_dedent
@@ -177,15 +179,17 @@ def required_imports(self) -> List[str]:
             return list(imports)
         return super().required_imports()
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds cells to the list of cells.
 
         Cells can be either code cells or markdown cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=1))
         cells.append(section_header)
@@ -209,9 +213,9 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
             cells.append(nbfv4.new_code_cell(code))
             for subsec in self.subsections:
                 if subsec.verbosity > Verbosity.LOW:
-                    subsec.add_cells(cells)
+                    subsec.add_cells(cells, df=df)
         else:
-            super().add_cells(cells)
+            super().add_cells(cells, df=df)
 
     def show(self, df: pd.DataFrame) -> None:
         """Generates cell output of this section in the calling notebook.
@@ -302,13 +306,15 @@ def required_imports(self) -> List[str]:
             ]
         return []
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds cells to the list of cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)
@@ -410,13 +416,15 @@ def required_imports(self) -> List[str]:
             "from IPython.display import display",
         ]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds data type inference cells to the list of cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)
@@ -429,24 +437,25 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
         if self.verbosity <= Verbosity.MEDIUM:
             code = default_call
         elif self.verbosity == Verbosity.HIGH:
-            code = (
-                get_code(series_to_frame)
-                + 2 * "\n"
-                + get_code(DataType)
-                + 2 * "\n"
-                + get_code(is_numeric)
-                + 2 * "\n"
-                + get_code(is_categorical)
-                + 2 * "\n"
-                + get_code(is_boolean)
-                + 2 * "\n"
-                + get_code(is_date)
-                + 2 * "\n"
-                + get_code(infer_data_type)
-                + 2 * "\n"
-                + get_code(DataTypes.data_types)
-                + 2 * "\n"
-                + default_call
+            code = "\n\n".join(
+                (
+                    *(
+                        get_code(function_or_class)
+                        for function_or_class in (
+                            series_to_frame,
+                            DataType,
+                            is_unique,
+                            is_numeric,
+                            is_missing,
+                            is_categorical,
+                            is_boolean,
+                            is_date,
+                            infer_data_type,
+                            DataTypes.data_types,
+                        )
+                    ),
+                    default_call,
+                )
             )
 
         cells.append(nbfv4.new_code_cell(code))
@@ -537,13 +546,15 @@ def required_imports(self) -> List[str]:
             "from IPython.display import Markdown",
         ]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds dataframe preview cells to the list of cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)
@@ -691,13 +702,15 @@ def required_imports(self) -> List[str]:
             "import matplotlib.pyplot as plt",
         ]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds code cells which calculate missing values percentage table to the list of cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)
@@ -822,13 +835,15 @@ def required_imports(self) -> List[str]:
             ]
         return base_imports + ["from IPython.display import display"]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds code cells which calculate constant occurrence table to the list of cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)
@@ -931,13 +946,15 @@ def required_imports(self) -> List[str]:
             ]
         return ["from IPython.display import display"]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds code cells which count the number of rows with missing value to the list of cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)
@@ -1040,13 +1057,15 @@ def required_imports(self) -> List[str]:
             ]
         return ["from IPython.display import display"]
 
-    def add_cells(self, cells: List[Dict[str, Any]]) -> None:
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Adds code cells which count the number of duplicated rows to the list of cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
         cells.append(section_header)

diff --git a/edvart/report_sections/group_analysis.py b/edvart/report_sections/group_analysis.py
@@ -593,7 +593,9 @@ def _add_function_defs(self, cells: List[Dict[str, Any]]):
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         code = (
             get_code(GroupAnalysis.default_group_quantile_stats)
@@ -621,7 +623,9 @@ def _add_cells_numeric_col(self, cells: List[Dict[str, Any]], column_name: str):
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         column_name : str
             Name of column for which to generate code.
         """
@@ -654,15 +658,17 @@ def _add_cells_numeric_col(self, cells: List[Dict[str, Any]], column_name: str):
             code += f"overlaid_histograms(df=df, groupby={self.groupby}, column='{column_name}')"
         cells.append(nbfv4.new_code_cell(code))
 
-    def add_cells(self, cells: List[Dict[str, Any]]):
+    def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
         """Add cells to the list of cells.
 
         Cells can be either code cells or markdown cells.
 
         Parameters
         ----------
         cells : List[Dict[str, Any]]
-            List of generated notebook cells which are represented as dictionaries.
+            List of generated notebook cells which are represented as dictionaries
+        df: pd.DataFrame
+            Data for which to add the cells.
         """
         section_header = nbfv4.new_markdown_cell(self.get_title(section_level=1))
         cells.append(section_header)