diff --git a/.gitignore b/.gitignore index 959f691..3ac7b25 100644 --- a/.gitignore +++ b/.gitignore @@ -64,7 +64,6 @@ wheels/ MANIFEST .idea - # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. @@ -145,4 +144,4 @@ venv.bak/ # Exceptions !docs/requirements.txt -!data/rainbow_beach.parquet \ No newline at end of file +!sam/datasets/data/*.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index 52ba4ba..cd68299 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,20 @@ Version X.Y.Z stands for: ------------- +## Version 3.1.0 + +### New features + +- New class `sam.models.LassoTimeseriesRegressor` to create a Lasso regression model for time series data incl. quantile predictions. +- New class `sam.preprocessing.ClipTransformer` to clip input values to the range from the train set, making models more robust against outliers. +- New abstract base class `sam.validation.BaseValidator` for all validators. +- Renamed `sam.validation.RemoveFlatlines` to `sam.validation.FlatlineValidator`. `sam.validation.RemoveFlatlines` is still available, but will be removed in future versions. +- Renamed `sam.validation.RemoveExtremeValues` to `sam.validation.MADValidator`. `sam.validation.RemoveExtremeValues` is still available, but will be removed in future versions. +- New class `sam.validation.OutsideRangeValidator` for checking / removing data outside of a range. +- New function `datetime_train_test_split` to split pandas dataframes and series based on a datetime. +- New `sam.datasets` module containing functions for loading ready-to-use datasets: `sam.datasets.load_rainbow_beach` and `sam.datasets.load_sewage_data`.
+ ## Version 3.0.4 ### Changes @@ -19,6 +33,7 @@ Version X.Y.Z stands for: - Updated package dependencies for scikit-learn - Changed the DeepExplainer to the model agnostic KernelExplainer, so we can remove all the v1 dependencies on tensorflow - Fixed pytest MPL bug by temporarily setting it to a previous version + ## Version 3.0.3 ### New features diff --git a/README.md b/README.md index de3ec3d..cbb53c2 100755 --- a/README.md +++ b/README.md @@ -32,11 +32,11 @@ Keep in mind that the sam package is updated frequently, and after a while, your Below you can find a simple example on how to use one of our timeseries models. For more examples, check our [example notebooks](https://github.com/RoyalHaskoningDHV/sam/tree/main/examples) ```python -import pandas as pd +from sam.datasets import load_rainbow_beach from sam.models import MLPTimeseriesRegressor from sam.feature_engineering import SimpleFeatureEngineer -data = pd.read_parquet("../data/rainbow_beach.parquet") # Requires `pyarrow` package +data = load_rainbow_beach() X, y = data, data["water_temperature"] # Easily create rolling and time features to be used by the model diff --git a/data/rainbow_beach.parquet b/data/rainbow_beach.parquet deleted file mode 100644 index 81c0e06..0000000 Binary files a/data/rainbow_beach.parquet and /dev/null differ diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst new file mode 100644 index 0000000..a82cfa7 --- /dev/null +++ b/docs/source/datasets.rst @@ -0,0 +1,18 @@ +.. _datasets: + +============= +Data Sets +============= + +This is the documentation for available datasets. + +Rainbow Beach +------------- +.. autofunction:: sam.datasets.load_rainbow_beach + + +Sewage data +----------- +.. autofunction:: sam.datasets.load_sewage_data + + diff --git a/docs/source/index.rst b/docs/source/index.rst index c574c7d..ca13c8f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -16,6 +16,7 @@ Welcome to SAM's documentation! 
data examples data_sources + datasets preprocessing exploration feature_engineering diff --git a/docs/source/preprocessing.rst b/docs/source/preprocessing.rst index 14eefc7..ef463fe 100644 --- a/docs/source/preprocessing.rst +++ b/docs/source/preprocessing.rst @@ -6,6 +6,13 @@ Preprocessing This is the documentation for preprocessing functions. +Clipping data +------------- +.. autoclass:: sam.preprocessing.ClipTransformer + :members: + :undoc-members: + :show-inheritance: + Normalize timestamps -------------------- .. warning:: diff --git a/docs/source/validation.rst b/docs/source/validation.rst index 4ad3e1d..a319f35 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -6,9 +6,23 @@ Data Validation This is the documentation for the validation functions. +Base Validation class +--------------------- +.. autoclass:: sam.validation.BaseValidator + :members: + :undoc-members: + :show-inheritance: + +Detect Outside Range +-------------------- +.. autoclass:: sam.validation.OutsideRangeValidator + :members: + :undoc-members: + :show-inheritance: + Detect Extreme Values --------------------------- -.. autoclass:: sam.validation.RemoveExtremeValues +.. autoclass:: sam.validation.MADValidator :members: :undoc-members: :show-inheritance: @@ -23,7 +37,7 @@ Testset image: Detect Flatlines --------------------------- -.. autoclass:: sam.validation.RemoveFlatlines +.. 
autoclass:: sam.validation.FlatlineValidator :members: :undoc-members: :show-inheritance: diff --git a/examples/feature_engineering.ipynb b/examples/feature_engineering.ipynb index c5f3712..2d49110 100644 --- a/examples/feature_engineering.ipynb +++ b/examples/feature_engineering.ipynb @@ -130,10 +130,9 @@ ], "source": [ "import pandas as pd\n", + "from sam.datasets import load_rainbow_beach\n", "\n", - "data = pd.read_parquet('../data/rainbow_beach.parquet')\n", - "\n", - "data.head()" + "data = load_rainbow_beach()" ] }, { diff --git a/examples/lasso.ipynb b/examples/lasso.ipynb new file mode 100644 index 0000000..f1c980c --- /dev/null +++ b/examples/lasso.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lasso regression for time series example\n", + "\n", + "This notebook provides an example of how to create a linear model (Lasso) with SAM.\n", + "\n", + "The timeseries model utilizes the feature engineering capabilities of SAM. To learn more about feature engineering, see the notebook `feature_engineering.ipynb` and the [Feature Engineering](https://sam.nist.gov/docs/feature-engineering) section of the SAM documentation."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-08-18 15:12:22.079152: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", + "2022-08-18 15:12:22.079547: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" + ] + } + ], + "source": [ + "from sam.models import LassoTimeseriesRegressor\n", + "from sam.feature_engineering import SimpleFeatureEngineer\n", + "from sam.datasets import load_rainbow_beach\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | batttery_life | \n", + "transducer_depth | \n", + "turbidity | \n", + "water_temperature | \n", + "wave_height | \n", + "wave_period | \n", + "
---|---|---|---|---|---|---|
TIME | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
2014-06-15 00:00:00 | \n", + "11.6 | \n", + "1.495 | \n", + "0.85 | \n", + "16.6 | \n", + "0.136 | \n", + "3.0 | \n", + "
2014-06-15 01:00:00 | \n", + "11.6 | \n", + "1.420 | \n", + "0.87 | \n", + "16.3 | \n", + "0.117 | \n", + "4.0 | \n", + "
2014-06-15 02:00:00 | \n", + "11.6 | \n", + "1.478 | \n", + "0.79 | \n", + "16.1 | \n", + "0.114 | \n", + "7.0 | \n", + "
2014-06-15 03:00:00 | \n", + "11.6 | \n", + "1.518 | \n", + "0.76 | \n", + "15.9 | \n", + "0.111 | \n", + "3.0 | \n", + "
2014-06-15 04:00:00 | \n", + "11.6 | \n", + "1.507 | \n", + "0.77 | \n", + "15.7 | \n", + "0.107 | \n", + "3.0 | \n", + "
LassoTimeseriesRegressor(alpha=0.0001, average_type='median',\n", + " feature_engineer=Pipeline(steps=[('features',\n", + " SimpleFeatureEngineer(rolling_features=[('wave_height',\n", + " 'mean',\n", + " 48),\n", + " ('wave_height',\n", + " 'mean',\n", + " 24),\n", + " ('wave_height',\n", + " 'mean',\n", + " 12),\n", + " ('wave_height',\n", + " 'mean',\n", + " 6),\n", + " ('wave_height',\n", + " 'mean',\n", + " 3)],\n", + " time_features=[('hour_of_day',\n", + " 'onehot'),\n", + " ('day_of_week',\n", + " 'onehot')])),\n", + " ('imputer',\n", + " SimpleImputer()),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " quantiles=(0.1, 0.9))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LassoTimeseriesRegressor(alpha=0.0001, average_type='median',\n", + " feature_engineer=Pipeline(steps=[('features',\n", + " SimpleFeatureEngineer(rolling_features=[('wave_height',\n", + " 'mean',\n", + " 48),\n", + " ('wave_height',\n", + " 'mean',\n", + " 24),\n", + " ('wave_height',\n", + " 'mean',\n", + " 12),\n", + " ('wave_height',\n", + " 'mean',\n", + " 6),\n", + " ('wave_height',\n", + " 'mean',\n", + " 3)],\n", + " time_features=[('hour_of_day',\n", + " 'onehot'),\n", + " ('day_of_week',\n", + " 'onehot')])),\n", + " ('imputer',\n", + " SimpleImputer()),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " quantiles=(0.1, 0.9))
Pipeline(steps=[('features',\n", + " SimpleFeatureEngineer(rolling_features=[('wave_height', 'mean',\n", + " 48),\n", + " ('wave_height', 'mean',\n", + " 24),\n", + " ('wave_height', 'mean',\n", + " 12),\n", + " ('wave_height', 'mean',\n", + " 6),\n", + " ('wave_height', 'mean',\n", + " 3)],\n", + " time_features=[('hour_of_day', 'onehot'),\n", + " ('day_of_week',\n", + " 'onehot')])),\n", + " ('imputer', SimpleImputer()), ('scaler', StandardScaler())])
SimpleFeatureEngineer(rolling_features=[('wave_height', 'mean', 48),\n", + " ('wave_height', 'mean', 24),\n", + " ('wave_height', 'mean', 12),\n", + " ('wave_height', 'mean', 6),\n", + " ('wave_height', 'mean', 3)],\n", + " time_features=[('hour_of_day', 'onehot'),\n", + " ('day_of_week', 'onehot')])
SimpleImputer()
StandardScaler()