Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add option to embed data when exporting report to notebook #206

Merged
merged 10 commits into from
Mar 7, 2024
69 changes: 65 additions & 4 deletions edvart/report.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import base64
import logging
import os
import pickle
import warnings
from abc import ABC
from copy import copy
from enum import Enum, auto
from typing import List, Optional, Tuple, Union

import isort
Expand Down Expand Up @@ -35,6 +37,14 @@ class EmptyReportWarning(UserWarning):
"""Warning raised when a report contains no sections."""


class ExportDataMode(str, Enum):
    """Data export mode for the report.

    Selects how the report's dataframe is made available to a notebook
    produced by exporting the report.
    """

    # Members mix in `str`, so each `auto()` value is stored in its string form.

    # The data is not exported; the notebook gets a placeholder for loading code.
    NONE = auto()
    # The data is written to a parquet file and loaded from there.
    FILE = auto()
    # The data is embedded directly into the notebook as an encoded string.
    EMBED = auto()


class ReportBase(ABC):
"""
Abstract base class for reports.
Expand All @@ -55,6 +65,8 @@ class ReportBase(ABC):
"import plotly.io as pio",
}

_DEFAULT_LOAD_DATA_CODE = "df = ... # TODO: Fill in code for loading data"

def __init__(
self,
dataframe: pd.DataFrame,
Expand Down Expand Up @@ -84,27 +96,76 @@ def show(self) -> None:
for section in self.sections:
section.show(self.df)

def _export_data(
    self, export_data_mode: ExportDataMode, notebook_file_path: Union[str, os.PathLike]
) -> Tuple[str, List[str]]:
    """
    Generates code for loading exported data into the exported notebook.

    Depending on the mode, this may also write the dataframe to a parquet file
    next to the notebook (``ExportDataMode.FILE``).

    Parameters
    ----------
    export_data_mode : ExportDataMode
        The mode of exporting the data.
    notebook_file_path : str or PathLike
        Filepath of the exported notebook.

    Returns
    -------
    Tuple[str, List[str]]
        A tuple containing the code for loading the data and a list of imports required for
        the code.
    """
    if export_data_mode == ExportDataMode.NONE:
        # Nothing is exported: hand back the placeholder the user has to fill in.
        return self._DEFAULT_LOAD_DATA_CODE, []

    if export_data_mode == ExportDataMode.FILE:
        notebook_path = str(notebook_file_path)
        notebook_suffix = ".ipynb"
        # Remove the extension as a suffix. `str.rstrip(".ipynb")` would strip any
        # trailing run of the characters ".ipynb" (e.g. "happy.ipynb" -> "ha").
        if notebook_path.endswith(notebook_suffix):
            notebook_path = notebook_path[: -len(notebook_suffix)]
        parquet_file_name = notebook_path + "-data.parquet"
        self.df.to_parquet(parquet_file_name)
        # `!r` emits a valid Python string literal even when the path contains
        # quotes or backslashes; ordinary paths render exactly as before.
        return f"df = pd.read_parquet({parquet_file_name!r})", ["import pandas as pd"]

    assert export_data_mode == ExportDataMode.EMBED
    # Base85-encode the parquet bytes; the bytes literal is pasted into the
    # generated code and decoded again when the notebook runs.
    buffer = base64.b85encode(self.df.to_parquet())
    return (
        code_dedent(
            f"""
            df_parquet = BytesIO(base64.b85decode({buffer}.decode()))
            df = pd.read_parquet(df_parquet)"""
        ),
        ["import base64", "import pandas as pd", "from io import BytesIO"],
    )

def export_notebook(
self,
notebook_filepath: str,
notebook_filepath: Union[str, os.PathLike],
dataset_name: str = "[INSERT DATASET NAME]",
dataset_description: str = "[INSERT DATASET DESCRIPTION]",
export_data_mode: ExportDataMode = ExportDataMode.NONE,
) -> None:
"""Exports the report as an .ipynb file.

Parameters
----------
notebook_filepath : str
notebook_filepath : str or PathLike
Filepath of the exported notebook.
dataset_name : str (default = "[INSERT DATASET NAME]")
Name of dataset to be used in the title of the report.
dataset_description : str (default = "[INSERT DATASET DESCRIPTION]")
Description of dataset to be used below the title of the report.
export_data_mode : ExportDataMode (default = ExportDataMode.NONE)
Mode for exporting the data to the notebook.
If ExportDataMode.NONE, the data is not exported to the notebook.
If ExportDataMode.FILE, the data is exported to a parquet file
and loaded from there.
If ExportDataMode.EMBED, the data is embedded into the notebook
as a base64 string.
"""
load_data_code, load_data_imports = self._export_data(
export_data_mode, notebook_file_path=notebook_filepath
)
# Generate a notebook containing dataset name and description
self._warn_if_empty()
nb = self._generate_notebook(
dataset_name=dataset_name, dataset_description=dataset_description
dataset_name=dataset_name,
dataset_description=dataset_description,
load_df=load_data_code,
extra_imports=load_data_imports,
)

# Save notebook to file
Expand All @@ -113,9 +174,9 @@ def export_notebook(

def _generate_notebook(
self,
load_df: str,
dataset_name: str = "[INSERT DATASET NAME]",
dataset_description: str = "[INSERT DATASET DESCRIPTION]",
load_df: str = "df = ...",
extra_imports: Optional[List[str]] = None,
show_load_data: bool = True,
) -> nbf.NotebookNode:
Expand Down
38,378 changes: 12 additions & 38,366 deletions examples/report-example.ipynb

Large diffs are not rendered by default.

19 changes: 18 additions & 1 deletion tests/test_report.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import pathlib
import warnings
from contextlib import redirect_stdout

import nbconvert
import nbformat
import numpy as np
import pandas as pd

from edvart.report import DefaultReport, Report
from edvart.report import DefaultReport, ExportDataMode, Report
from edvart.report_sections.bivariate_analysis import BivariateAnalysis
from edvart.report_sections.section_base import Verbosity
from edvart.report_sections.univariate_analysis import UnivariateAnalysis
Expand Down Expand Up @@ -90,3 +93,17 @@ def test_show():
warnings.simplefilter("ignore", UserWarning)
with redirect_stdout(None):
report.show()


def test_exported_notebook_executes(tmp_path: pathlib.Path):
    """Export a report with each data-carrying mode and execute the resulting notebook."""
    report = Report(dataframe=_get_test_df())

    report.add_overview()
    for export_data_mode in (ExportDataMode.EMBED, ExportDataMode.FILE):
        # The path must be an f-string: without the prefix every mode wrote to the
        # same literally named file. `.name` gives a stable, readable file name.
        export_path = tmp_path / f"export_{export_data_mode.name}.ipynb"
        report.export_notebook(export_path, export_data_mode=export_data_mode)

        notebook = nbformat.read(export_path, as_version=4)
        preprocessor = nbconvert.preprocessors.ExecutePreprocessor(timeout=60)

        # Run the exported notebook through the execute preprocessor.
        preprocessor.preprocess(notebook)
Loading