From d664e6c576fac1e18c13e56c04ea54fad6912ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Bel=C3=A1k?= <97168298+mbelak-dtml@users.noreply.github.com> Date: Mon, 7 Aug 2023 13:13:17 +0200 Subject: [PATCH] feat!: Refactor `Report` into `Report` and `DefaultReport`. (#47) Resolves #45 --- api-example.md | 4 +- docs/advanced.rst | 24 ++-- docs/getting_started.rst | 6 +- edvart/__init__.py | 1 + edvart/report.py | 254 ++++++++++++++++++++++----------------- tests/test_report.py | 54 +++++++-- 6 files changed, 207 insertions(+), 136 deletions(-) diff --git a/api-example.md b/api-example.md index 6948491..e904ffc 100644 --- a/api-example.md +++ b/api-example.md @@ -35,7 +35,7 @@ dataset.head() ``` ```python -report = edvart.Report( +report = edvart.DefaultReport( dataset, verbosity=0, columns_overview=['Name', 'Survived'], @@ -69,7 +69,7 @@ dataset_ts = edvart.example_datasets.dataset_global_temp() ``` ```python -report_ts = edvart.TimeseriesReport( +report_ts = edvart.DefaultTimeseriesReport( dataset_ts, # Monthly data -> analyze yearly seasonality sampling_rate=12, diff --git a/docs/advanced.rst b/docs/advanced.rst index 90aa44f..1a351bc 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -10,7 +10,10 @@ Report class ------------ The most important class of the package :py:class:`~edvart.report.Report`. -This class allows you to specify :ref:`verbosity ` or specify which columns should be used. +The report consists of sections, which can be added via methods of the `Report` class. +The report is empty by default. +The class :py:class:`~edvart.report.DefaultReport` is a subclass of `Report`, +which contains a default set of sections. With created instance of `Report` you can: @@ -114,17 +117,18 @@ or modifying sections settings. Selection of sections ~~~~~~~~~~~~~~~~~~~~~ - -If you want to use only a subset of sections you have to set -`use_default_sections` parameter of report to `False` and then you can add your own sections. 
+You can add sections using methods `add_*` of the `Report` class. .. code-block:: python - # Shows only univariate analysis + # Shows only univariate and bivariate analysis import edvart df = edvart.example_datasets.dataset_titanic() - report = edvart.Report(df, use_default_sections=False) - report.add_univariate_analysis() + report = ( + edvart.Report(df) + .add_univariate_analysis() + .add_bivariate_analysis() + ) Sections configuration @@ -141,7 +145,7 @@ Or you can set section verbosity (described later). import edvart df = edvart.example_datasets.dataset_titanic() - report = edvart.Report(df, columns_overview=["Name", "Survived"], use_default_sections=False) + report = edvart.Report(df) report.add_overview(omit_columns=["PassengerId"]).add_univariate_analysis( use_columns=["Name", "Sex", "Age"] @@ -176,7 +180,7 @@ Examples: import edvart df = edvart.example_datasets.dataset_titanic() - edvart.Report(df, verbosity=1).export_notebook("test-export.ipynb") + edvart.DefaultReport(df, verbosity=1).export_notebook("test-export.ipynb") .. code-block:: python @@ -185,4 +189,4 @@ Examples: import edvart df = edvart.example_datasets.dataset_titanic() - edvart.Report(df, verbosity=1, verbosity_univariate_analysis=2).export_notebook("test-export.ipynb") + edvart.DefaultReport(df, verbosity=1, verbosity_univariate_analysis=2).export_notebook("test-export.ipynb") diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 7d2040c..352fa81 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -7,7 +7,7 @@ Getting started import edvart df = edvart.example_datasets.dataset_titanic() - edvart.Report(df).show() + edvart.DefaultReport(df).show() 2. Generate report notebook @@ -15,7 +15,7 @@ Getting started import edvart df = edvart.example_datasets.dataset_titanic() - report = edvart.Report(df) + report = edvart.DefaultReport(df) report.export_notebook("titanic_report.ipynb") You can modify the generated notebook if you want to modify some settings. 
@@ -28,7 +28,7 @@ For more advanced usage of edvart, please read the documentation section import edvart df = edvart.example_datasets.dataset_titanic() - report = edvart.Report(df) + report = edvart.DefaultReport(df) report.export_html( html_filepath="titanic_report.html", dataset_name="Titanic", diff --git a/edvart/__init__.py b/edvart/__init__.py index 572e2f6..b0123f7 100644 --- a/edvart/__init__.py +++ b/edvart/__init__.py @@ -4,6 +4,7 @@ from importlib.metadata import PackageNotFoundError, version from edvart import example_datasets +from edvart.report import DefaultReport, DefaultTimeseriesReport from edvart.report import Report from edvart.report import Report as create_report from edvart.report import TimeseriesReport diff --git a/edvart/report.py b/edvart/report.py index e46ca5f..dd86952 100755 --- a/edvart/report.py +++ b/edvart/report.py @@ -34,8 +34,6 @@ class ReportBase(ABC): ---------- dataframe : pd.DataFrame Data from which to generate the report. - use_default_sections : bool (default = True) - Whether add the report's default sections to the report. verbosity : int (default = 0) The default verbosity for the exported code of the entire report, has to be one of [0, 1, 2], by default 0. @@ -49,12 +47,10 @@ class ReportBase(ABC): def __init__( self, dataframe: pd.DataFrame, - use_default_sections: bool = True, verbosity: int = 0, ): self._class_logger = logging.getLogger(__name__).getChild(self.__class__.__name__) self.df = dataframe - self.use_default_sections = use_default_sections self.sections = [] # Check for global verbosity validity if verbosity not in [0, 1, 2]: @@ -653,21 +649,39 @@ def add_table_of_contents(self, include_subsections: bool = True) -> "ReportBase class Report(ReportBase): - """This class instantiates an object that the edvart user should mainly use for report - configuration and export. + """ + A report for tabular datasets. Contains no sections by default. + + See `DefaultReport` for a report with default sections. 
+ See methods `add_*` for adding sections to the report. + + Parameters + ---------- + dataframe : pd.DataFrame + Data from which to generate the report. + verbosity : int (default = 0) + Verbosity of the exported code of the entire report, has to be one of + [0, 1, 2], by default 0. + """ + + def __init__(self, dataframe: pd.DataFrame, verbosity: int = 0): + super().__init__(dataframe=dataframe, verbosity=verbosity) + - This class is intended for creating general-purpose reports. - For creating a report for time-series data, please see `TimeseriesReport`. +class DefaultReport(Report): + """A report for tabular data containing default sections. + + The report contains the following sections: + - dataset overview + - univariate analysis + - bivariate analysis + - multivariate analysis + - group analysis (if `groupby` is specified) Parameters ---------- dataframe : pd.DataFrame Data from which to generate the report. - use_default_sections : bool (default = True) - If True, all default sections of the report are added, otherwise you have to add - the sections manually using add_
() methods. - Default sections for this report are: overview, univariate analysis, bivariate analysis and - multivariate analysis. verbosity : int (default = 0) The default verbosity for the exported code of the entire report, has to be one of [0, 1, 2], by default 0. @@ -701,7 +715,6 @@ class Report(ReportBase): def __init__( self, dataframe: pd.DataFrame, - use_default_sections: bool = True, verbosity: int = 0, verbosity_overview: Optional[int] = None, verbosity_univariate_analysis: Optional[int] = None, @@ -715,7 +728,7 @@ def __init__( columns_group_analysis: Optional[List[str]] = None, groupby: Union[str, List[str]] = None, ): - super().__init__(dataframe, use_default_sections, verbosity) + super().__init__(dataframe, verbosity) # If section verbosities are not set, default to the global verbosity if verbosity_overview is None: @@ -726,90 +739,53 @@ def __init__( verbosity_bivariate_analysis = verbosity if verbosity_multivariate_analysis is None: verbosity_multivariate_analysis = verbosity - # Add default sections if user doesn't build the report manually - if use_default_sections: - self.add_table_of_contents(include_subsections=True) - self.add_overview(verbosity=verbosity_overview, use_columns=columns_overview) - self.add_univariate_analysis( - verbosity=verbosity_univariate_analysis, - use_columns=columns_univariate_analysis, - ) - if isinstance(groupby, str): - color_col = groupby - elif hasattr(groupby, "__len__") and len(groupby) == 1: - color_col = groupby[0] - else: - color_col = None - self.add_bivariate_analysis( - verbosity=verbosity_bivariate_analysis, - use_columns=columns_bivariate_analysis, - color_col=color_col, - ) - self.add_multivariate_analysis( - verbosity=verbosity_multivariate_analysis, - use_columns=columns_multivariate_analysis, - color_col=color_col, + + # Add default sections + self.add_table_of_contents(include_subsections=True) + self.add_overview(verbosity=verbosity_overview, use_columns=columns_overview) + 
self.add_univariate_analysis( + verbosity=verbosity_univariate_analysis, + use_columns=columns_univariate_analysis, + ) + if isinstance(groupby, str): + color_col = groupby + elif hasattr(groupby, "__len__") and len(groupby) == 1: + color_col = groupby[0] + else: + color_col = None + self.add_bivariate_analysis( + verbosity=verbosity_bivariate_analysis, + use_columns=columns_bivariate_analysis, + color_col=color_col, + ) + self.add_multivariate_analysis( + verbosity=verbosity_multivariate_analysis, + use_columns=columns_multivariate_analysis, + color_col=color_col, + ) + if groupby is not None: + self.add_group_analysis( + groupby=groupby, + use_columns=columns_group_analysis, + verbosity=verbosity_group_analysis, ) - if groupby is not None: - self.add_group_analysis( - groupby=groupby, - use_columns=columns_group_analysis, - verbosity=verbosity_group_analysis, - ) class TimeseriesReport(ReportBase): - """This class instantiates an object that the edvart user should mainly use for report - configuration and export, specifically for timeseries data. + """ + A report for time-series data. Contains no sections by default. - Parameters - ---------- - dataframe : pd.DataFrame - Data from which to generate the report. Data needs to be indexed by time: pd.DateTimeIndex - or pd.PeriodIndex. - The data is assumed to be sorted according to the time index in ascending order. - use_default_sections : bool, optional - If True, all default sections of the report are added, otherwise you have to add - the sections manually using add_
() methods. - Default sections for this report are overview, univariate analysis and timeseries analysis. - verbosity : int (default = 0) - The default verbosity for the exported code of the entire report, has to be one of - [0, 1, 2], by default 0. - verbosity_overview : int, optional - Verbosity of the overview section - verbosity_univariate_analysis : int, optional - Verbosity of the univariate analysis section - verbosity_timeseries_analysis : int, optional - Verbosity of the timeseries analysis section - columns_overview : List[str], optional - Subset of columns to use in overview section - columns_univariate_analysis : List[str], optional - Subset of columns to use in univariate analysis section - columns_timeseries_analysis : List[str], optional - Subset of columns to use in timeseries analysis section - sampling_rate : int, optional - Sampling rate for Fourier transform and Short-time Fourier transform subsections. Determines - frequency unit for analysis of frequencies, for example with monthly data and sampling rate - 12, yearly frequncy spectrum is produced. - If not set, these two sections will not be included. - stft_window_size : int, optional - Windows size for short-time Fourier transform subsection. If not set, STFT will be exluded. + See `DefaultTimeseriesReport` for a time-series report with default sections. + See methods `add_*` for adding sections to the report. + + Raises + ------ + ValueError + If the input dataframe is not indexed by time. 
""" - def __init__( - self, - dataframe: pd.DataFrame, - use_default_sections: bool = True, - verbosity: int = 0, - verbosity_overview: Optional[int] = None, - verbosity_univariate_analysis: Optional[int] = None, - verbosity_timeseries_analysis: Optional[int] = None, - columns_overview: Optional[List[str]] = None, - columns_univariate_analysis: Optional[List[str]] = None, - columns_timeseries_analysis: Optional[List[str]] = None, - sampling_rate: Optional[int] = None, - stft_window_size: Optional[int] = None, - ): + def __init__(self, dataframe: pd.DataFrame, verbosity: int = 0): + super().__init__(dataframe, verbosity) if not is_date(dataframe.index): raise ValueError( "Input dataframe needs to be indexed by time." @@ -821,27 +797,6 @@ def __init__( dataframe.index = pd.PeriodIndex(dataframe.index) else: dataframe.index = pd.DatetimeIndex(dataframe.index) - super().__init__(dataframe, use_default_sections, verbosity) - - if verbosity_overview is None: - verbosity_overview = verbosity - if verbosity_univariate_analysis is None: - verbosity_univariate_analysis = verbosity - if verbosity_timeseries_analysis is None: - verbosity_timeseries_analysis = verbosity - if use_default_sections: - self.add_table_of_contents(include_subsections=True) - self.add_overview(verbosity=verbosity_overview, use_columns=columns_overview) - self.add_univariate_analysis( - verbosity=verbosity_univariate_analysis, - use_columns=columns_univariate_analysis, - ) - self.add_timeseries_analysis( - verbosity=verbosity_timeseries_analysis, - use_columns=columns_timeseries_analysis, - sampling_rate=sampling_rate, - stft_window_size=stft_window_size, - ) def add_timeseries_analysis( self, @@ -928,3 +883,76 @@ def add_timeseries_analysis( ) return self + + +class DefaultTimeseriesReport(TimeseriesReport): + """A default report for time series data. 
+ + The report contains the following sections: + - dataset overview + - univariate analysis + - timeseries analysis + + Parameters + ---------- + dataframe : pd.DataFrame + Data from which to generate the report. Data needs to be indexed by time: pd.DatetimeIndex + or pd.PeriodIndex. + The data is assumed to be sorted according to the time index in ascending order. + verbosity : int (default = 0) + The default verbosity for the exported code of the entire report, has to be one of + [0, 1, 2], by default 0. + verbosity_overview : int, optional + Verbosity of the overview section + verbosity_univariate_analysis : int, optional + Verbosity of the univariate analysis section + verbosity_timeseries_analysis : int, optional + Verbosity of the timeseries analysis section + columns_overview : List[str], optional + Subset of columns to use in overview section + columns_univariate_analysis : List[str], optional + Subset of columns to use in univariate analysis section + columns_timeseries_analysis : List[str], optional + Subset of columns to use in timeseries analysis section + sampling_rate : int, optional + Sampling rate for Fourier transform and Short-time Fourier transform subsections. Determines + frequency unit for analysis of frequencies, for example with monthly data and sampling rate + 12, yearly frequency spectrum is produced. + If not set, these two sections will not be included. + stft_window_size : int, optional + Window size for short-time Fourier transform subsection. If not set, STFT will be excluded. 
+ """ + + def __init__( + self, + dataframe: pd.DataFrame, + verbosity: int = 0, + verbosity_overview: Optional[int] = None, + verbosity_univariate_analysis: Optional[int] = None, + verbosity_timeseries_analysis: Optional[int] = None, + columns_overview: Optional[List[str]] = None, + columns_univariate_analysis: Optional[List[str]] = None, + columns_timeseries_analysis: Optional[List[str]] = None, + sampling_rate: Optional[int] = None, + stft_window_size: Optional[int] = None, + ): + super().__init__(dataframe, verbosity) + + if verbosity_overview is None: + verbosity_overview = verbosity + if verbosity_univariate_analysis is None: + verbosity_univariate_analysis = verbosity + if verbosity_timeseries_analysis is None: + verbosity_timeseries_analysis = verbosity + self.add_table_of_contents(include_subsections=True) + self.add_overview(verbosity=verbosity_overview, use_columns=columns_overview) + self.add_univariate_analysis( + verbosity=verbosity_univariate_analysis, + use_columns=columns_univariate_analysis, + ) + self.add_timeseries_analysis( + verbosity=verbosity_timeseries_analysis, + use_columns=columns_timeseries_analysis, + sampling_rate=sampling_rate, + stft_window_size=stft_window_size, + ) diff --git a/tests/test_report.py b/tests/test_report.py index 5439b38..bcf4da7 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -4,14 +4,54 @@ import numpy as np import pandas as pd -from edvart import Report +from edvart.report import DefaultReport, Report -def test_column_selection(): - test_df = pd.DataFrame( +def _get_test_df() -> pd.DataFrame: + return pd.DataFrame( data=np.random.random_sample((50, 20)), columns=[f"Col{i}" for i in range(20)] ) - report = Report(dataframe=test_df, use_default_sections=False) + + +def test_report(): + report = Report(dataframe=_get_test_df()) + assert len(report.sections) == 0, "Report should be empty" + + report.add_overview(verbosity=1) + assert len(report.sections) == 1, "Report should have one section" + + 
report.add_bivariate_analysis(verbosity=2, use_columns=["Col1", "Col2", "Col3"]) + assert len(report.sections) == 2, "Report should have two sections" + + assert report.sections[0].name == "Overview", "Wrong section name" + assert report.sections[0].verbosity == 1, "Wrong section verbosity" + assert report.sections[0].columns is None, "Default column selection should be None" + + assert report.sections[1].columns == ["Col1", "Col2", "Col3"], "Wrong columns" + + +def test_default_report(): + report = DefaultReport( + dataframe=_get_test_df(), + verbosity_overview=1, + verbosity_univariate_analysis=2, + columns_bivariate_analysis=["Col1", "Col2", "Col3"], + ) + assert len(report.sections) > 0, "Default report should not be empty" + + assert report.sections[1].verbosity == 1, "Wrong section verbosity" + assert report.sections[1].columns is None, "Default column selection should be None" + + assert report.sections[2].verbosity == 2, "Wrong section verbosity" + assert report.sections[2].columns is None, "Default column selection should be None" + + assert report.sections[3].verbosity == 0, "Wrong section verbosity" + assert report.sections[3].columns == ["Col1", "Col2", "Col3"], "Wrong columns" + + +def test_column_selection(): + test_df = _get_test_df() + report = Report(dataframe=test_df) # Default column selection report.add_overview() @@ -34,10 +74,8 @@ def test_column_selection(): def test_show(): - test_df = pd.DataFrame( - data=np.random.random_sample((50, 20)), columns=[f"Col{i}" for i in range(20)] - ) - report = Report(dataframe=test_df, use_default_sections=False) + test_df = _get_test_df() + report = Report(dataframe=test_df) with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning)