Skip to content

Commit

Permalink
feat!: filter and check columns in section main functions
Browse files Browse the repository at this point in the history
Previously, columns were computed in the `__init__` methods of the respective subsections of multivariate analysis.
This required the DataFrame on which the analysis is performed to be specified in the constructor,
which prevented creating a section without a DataFrame.
This change makes #36 much easier and enables removal
of the `add_<section>` methods.
  • Loading branch information
mbelak-dtml committed Sep 20, 2023
2 parents a7fafbb + dcec60e commit 9d7f09d
Show file tree
Hide file tree
Showing 25 changed files with 203 additions and 126 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pip install edvart
See the notebook
[examples/report-example.ipynb](https://nbviewer.org/github/datamole-ai/edvart/blob/main/examples/report-example.ipynb)
for an example report on a tabular dataset or
[examples/time-series-report-time-series-report-example.ipynb](https://nbviewer.org/github/datamole-ai/edvart/blob/main/examples/time-series-report-example.ipynb)
[examples/time-series-report-example.ipynb](https://nbviewer.org/github/datamole-ai/edvart/blob/main/examples/time-series-report-example.ipynb)
for an example report on a time-series dataset.

See the [Usage section](https://datamole-ai.github.io/edvart/usage.html) of the documentation
Expand Down
2 changes: 1 addition & 1 deletion edvart/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def _generate_notebook(
if self._table_of_contents is not None:
self._table_of_contents.add_cells(self.sections, nb["cells"])
for section in self.sections:
section.add_cells(nb["cells"])
section.add_cells(cells=nb["cells"], df=self.df)

return nb

Expand Down
26 changes: 17 additions & 9 deletions edvart/report_sections/bivariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def bivariate_analysis(
for sub in bivariate_analysis.subsections:
sub.show(df)

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds cells to the list of cells.
Cells can be either code cells or markdown cells.
Expand All @@ -221,6 +221,8 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=1))
cells.append(section_header)
Expand Down Expand Up @@ -251,9 +253,9 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
cells.append(nbfv4.new_code_cell(code))
for sub in self.subsections:
if sub.verbosity > Verbosity.LOW:
sub.add_cells(cells)
sub.add_cells(cells=cells, df=df)
else:
super().add_cells(cells)
super().add_cells(cells=cells, df=df)

def required_imports(self) -> List[str]:
"""Returns a list of imports to be put at the top of a generated notebook.
Expand Down Expand Up @@ -518,13 +520,15 @@ def required_imports(self) -> List[str]:
"from edvart.data_types import is_numeric",
]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds cells to the list of cells. Cells can be either code cells or markdown cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down Expand Up @@ -697,13 +701,15 @@ def required_imports(self) -> List[str]:
"import seaborn as sns",
]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds cells to the list of cells. Cells can be either code cells or markdown cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down Expand Up @@ -958,13 +964,15 @@ def required_imports(self) -> List[str]:
"import matplotlib.pyplot as plt",
]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds cells to the list of cells. Cells can be either code cells or markdown cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down
91 changes: 55 additions & 36 deletions edvart/report_sections/dataset_overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
is_boolean,
is_categorical,
is_date,
is_missing,
is_numeric,
is_unique,
)
from edvart.pandas_formatting import hide_index, render_dictionary, series_to_frame
from edvart.report_sections.code_string_formatting import get_code, total_dedent
Expand Down Expand Up @@ -177,15 +179,17 @@ def required_imports(self) -> List[str]:
return list(imports)
return super().required_imports()

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds cells to the list of cells.
Cells can be either code cells or markdown cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=1))
cells.append(section_header)
Expand All @@ -209,9 +213,9 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
cells.append(nbfv4.new_code_cell(code))
for subsec in self.subsections:
if subsec.verbosity > Verbosity.LOW:
subsec.add_cells(cells)
subsec.add_cells(cells, df=df)
else:
super().add_cells(cells)
super().add_cells(cells, df=df)

def show(self, df: pd.DataFrame) -> None:
"""Generates cell output of this section in the calling notebook.
Expand Down Expand Up @@ -302,13 +306,15 @@ def required_imports(self) -> List[str]:
]
return []

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds cells to the list of cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down Expand Up @@ -410,13 +416,15 @@ def required_imports(self) -> List[str]:
"from IPython.display import display",
]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds data type inference cells to the list of cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand All @@ -429,24 +437,25 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
if self.verbosity <= Verbosity.MEDIUM:
code = default_call
elif self.verbosity == Verbosity.HIGH:
code = (
get_code(series_to_frame)
+ 2 * "\n"
+ get_code(DataType)
+ 2 * "\n"
+ get_code(is_numeric)
+ 2 * "\n"
+ get_code(is_categorical)
+ 2 * "\n"
+ get_code(is_boolean)
+ 2 * "\n"
+ get_code(is_date)
+ 2 * "\n"
+ get_code(infer_data_type)
+ 2 * "\n"
+ get_code(DataTypes.data_types)
+ 2 * "\n"
+ default_call
code = "\n\n".join(
(
*(
get_code(function_or_class)
for function_or_class in (
series_to_frame,
DataType,
is_unique,
is_numeric,
is_missing,
is_categorical,
is_boolean,
is_date,
infer_data_type,
DataTypes.data_types,
)
),
default_call,
)
)

cells.append(nbfv4.new_code_cell(code))
Expand Down Expand Up @@ -537,13 +546,15 @@ def required_imports(self) -> List[str]:
"from IPython.display import Markdown",
]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds dataframe preview cells to the list of cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down Expand Up @@ -691,13 +702,15 @@ def required_imports(self) -> List[str]:
"import matplotlib.pyplot as plt",
]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds code cells which calculate missing values percentage table to the list of cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down Expand Up @@ -822,13 +835,15 @@ def required_imports(self) -> List[str]:
]
return base_imports + ["from IPython.display import display"]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds code cells which calculate constant occurrence table to the list of cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down Expand Up @@ -931,13 +946,15 @@ def required_imports(self) -> List[str]:
]
return ["from IPython.display import display"]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds code cells which count the number of rows with missing value to the list of cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down Expand Up @@ -1040,13 +1057,15 @@ def required_imports(self) -> List[str]:
]
return ["from IPython.display import display"]

def add_cells(self, cells: List[Dict[str, Any]]) -> None:
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Adds code cells which count the number of duplicated rows to the list of cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=2))
cells.append(section_header)
Expand Down
14 changes: 10 additions & 4 deletions edvart/report_sections/group_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,9 @@ def _add_function_defs(self, cells: List[Dict[str, Any]]):
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
code = (
get_code(GroupAnalysis.default_group_quantile_stats)
Expand Down Expand Up @@ -621,7 +623,9 @@ def _add_cells_numeric_col(self, cells: List[Dict[str, Any]], column_name: str):
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
column_name : str
Name of column for which to generate code.
"""
Expand Down Expand Up @@ -654,15 +658,17 @@ def _add_cells_numeric_col(self, cells: List[Dict[str, Any]], column_name: str):
code += f"overlaid_histograms(df=df, groupby={self.groupby}, column='{column_name}')"
cells.append(nbfv4.new_code_cell(code))

def add_cells(self, cells: List[Dict[str, Any]]):
def add_cells(self, cells: List[Dict[str, Any]], df: pd.DataFrame) -> None:
"""Add cells to the list of cells.
Cells can be either code cells or markdown cells.
Parameters
----------
cells : List[Dict[str, Any]]
List of generated notebook cells which are represented as dictionaries.
List of generated notebook cells which are represented as dictionaries
df: pd.DataFrame
Data for which to add the cells.
"""
section_header = nbfv4.new_markdown_cell(self.get_title(section_level=1))
cells.append(section_header)
Expand Down
Loading

0 comments on commit 9d7f09d

Please sign in to comment.