From 8f36847e9addc5a2d8f1965b1e8067f9b8337daf Mon Sep 17 00:00:00 2001
From: architgajjar8464 <architgajjar8464@gmail.com>
Date: Thu, 21 Mar 2024 14:30:53 -0700
Subject: [PATCH] Appliances Energy Prediction Dataset under Test

---
 training/tests/test_registry.py               |   1 +
 .../datasets/_appliances_energy_prediction.py | 205 ++++++++++++++++++
 2 files changed, 206 insertions(+)
 create mode 100644 training/xtime/datasets/_appliances_energy_prediction.py

diff --git a/training/tests/test_registry.py b/training/tests/test_registry.py
index bcf9d0a..06e5cd9 100644
--- a/training/tests/test_registry.py
+++ b/training/tests/test_registry.py
@@ -54,6 +54,7 @@ def test_dataset_registry(self):
             "madeline",
             "fraud_detection",
             "harth",
+            "appliances_energy_prediction",
         ]
 
         self.assertIsInstance(names, list)
diff --git a/training/xtime/datasets/_appliances_energy_prediction.py b/training/xtime/datasets/_appliances_energy_prediction.py
new file mode 100644
index 0000000..ee2dd1b
--- /dev/null
+++ b/training/xtime/datasets/_appliances_energy_prediction.py
@@ -0,0 +1,205 @@
+###
+# Copyright (2023) Hewlett Packard Enterprise Development LP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###
+
+import logging
+import os
+from datetime import datetime
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+
+from xtime.datasets import Dataset, DatasetBuilder, DatasetMetadata, DatasetSplit
+from xtime.datasets.preprocessing import TimeSeries, TimeSeriesEncoderV1
+from xtime.errors import DatasetError
+from xtime.ml import Feature, FeatureType, RegressionTask, TaskType
+
+__all__ = ["AEPBuilder"]
+
+logger = logging.getLogger(__name__)
+
+
+_XTIME_DATASETS_AEP = "XTIME_DATASETS_AEP"  # may also point to a file: that file's parent directory is used then
+"""Environment variable that points to a directory with AEP dataset."""
+
+_AEP_HOME_PAGE = "https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction"
+"""Dataset home page."""
+
+_AEP_DATASET_FILE = "energydata_complete.csv"
+"""File containing raw (unprocessed) AEP dataset that is located inside _XTIME_DATASETS_AEP directory."""
+
+
+class AEPBuilder(DatasetBuilder):
+    """AEP: Appliances Energy Prediction.
+
+    Experimental data used to create regression models of appliances energy use in a low energy building.
+    The data is sampled every 10 minutes for about 4.5 months. The house temperature and humidity were
+    monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and
+    humidity conditions about every 3.3 minutes; these readings were then averaged over 10-minute periods.
+    The energy data was logged every 10 minutes with m-bus energy meters. Dataset home page:
+        https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction
+    """
+
+    NAME = "appliances_energy_prediction"
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.builders.update(default=self._build_default_dataset)  # register the builder of the `default` version
+        self.encoder = TimeSeriesEncoderV1()  # NOTE(review): not used anywhere in the visible code - confirm it is needed
+
+    def _check_pre_requisites(self) -> None:
+        """Resolve `self._dataset_dir` and check the raw file and `tsfresh` are available, raising otherwise."""
+        # Check raw dataset exists.
+        if _XTIME_DATASETS_AEP not in os.environ:
+            raise DatasetError.missing_prerequisites(
+                f"No environment variable found `{_XTIME_DATASETS_AEP}` that should point to a directory with "
+                f"AEP (Appliances Energy Prediction) dataset `{_AEP_DATASET_FILE}` that can be "
+                f"downloaded from `{_AEP_HOME_PAGE}`."
+            )
+        self._dataset_dir = Path(os.environ[_XTIME_DATASETS_AEP]).absolute()
+        if self._dataset_dir.is_file():
+            self._dataset_dir = self._dataset_dir.parent
+        if not (self._dataset_dir / _AEP_DATASET_FILE).is_file():
+            raise DatasetError.missing_prerequisites(
+                f"AEP (Appliances Energy Prediction) dataset location was identified as `{self._dataset_dir}`, but this is "
+                f"either not a directory or dataset file (`{_AEP_DATASET_FILE}`) not found in this location. "
+                f"Please, download `{_AEP_DATASET_FILE}` of this dataset from its home page `{_AEP_HOME_PAGE}`."
+            )
+
+        # Check `tsfresh` library can be imported. NOTE(review): nothing in the visible code uses it - confirm.
+        try:
+            import tsfresh.feature_extraction.feature_calculators  # noqa: F401 - imported only to probe availability
+        except ImportError as err:
+            raise DatasetError.missing_prerequisites(
+                "The AEP dataset requires `tsfresh` library to compute ML features. If it has not been installed, "
+                "please install it with `pip install tsfresh==0.20.2`. If it is installed, there may be incompatible "
+                "CUDA runtime found (see if the cause for the import error is "
+                "`numba.cuda.cudadrv.error.NvvmSupportError` exception) - this may occur because `tsfresh` depends on "
+                "`stumpy` that depends on `numba` that detects CUDA runtime and tries to use it if available. Try "
+                "disabling CUDA for numba by exporting NUMBA_DISABLE_CUDA environment variable "
+                "(https://numba.pydata.org/numba-doc/dev/reference/envvars.html#envvar-NUMBA_DISABLE_CUDA): "
+                "`export NUMBA_DISABLE_CUDA=1`."
+            ) from err
+
+    def _build_default_dataset(self, **kwargs) -> Dataset:
+        """Build the `default` version of the dataset.
+
+        Makes sure the train/test CSV splits exist on disk, loads them and wraps them into a `Dataset` whose
+        target is the `Appliances` column. Raises `ValueError` if any keyword argument is provided.
+        """
+        if kwargs:
+            raise ValueError(f"{self.__class__.__name__}: `default` dataset does not accept arguments.")
+        self._clean_dataset()
+        self._create_default_dataset()
+
+        train_df = pd.read_csv(self._dataset_dir / (_AEP_DATASET_FILE[0:-4] + "-default-train.csv"))
+        test_df = pd.read_csv(self._dataset_dir / (_AEP_DATASET_FILE[0:-4] + "-default-test.csv"))
+
+        target: str = "Appliances"
+        # Calendar fields derived from the timestamp are ordinal; every other non-target column is continuous.
+        ordinal_features = {"year", "month", "day", "hour", "min"}
+        features = [
+            Feature(name, FeatureType.ORDINAL if name in ordinal_features else FeatureType.CONTINUOUS)
+            for name in train_df.columns
+            if name != target
+        ]
+
+        # Check that data frames contains expected columns
+        assert train_df.shape[1] == len(features) + 1, "Train data frame contains wrong number of columns."
+        assert test_df.shape[1] == len(features) + 1, "Test data frame contains wrong number of columns."
+        for feature in features:
+            assert (
+                feature.name in train_df.columns
+            ), f"Missing column `{feature.name}` in train dataframe (columns={list(train_df.columns)})."
+            assert (
+                feature.name in test_df.columns
+            ), f"Missing column `{feature.name}` in test dataframe (columns={list(test_df.columns)})."
+
+        dataset = Dataset(
+            metadata=DatasetMetadata(
+                name=AEPBuilder.NAME,
+                version="default",
+                task=RegressionTask(TaskType.REGRESSION),
+                features=features,
+                properties={"source": self._dataset_dir.as_uri()},
+            ),
+            splits={
+                DatasetSplit.TRAIN: DatasetSplit(x=train_df.drop(target, axis=1, inplace=False), y=train_df[target]),
+                DatasetSplit.TEST: DatasetSplit(x=test_df.drop(target, axis=1, inplace=False), y=test_df[target]),
+            },
+        )
+        return dataset
+
+    def _clean_dataset(self) -> None:
+        """Clean raw AEP dataset (currently a no-op).
+
+        The raw dataset is a single `.csv` file that `_create_default_dataset` reads directly, so there is no
+        separate "clean" file to produce. The method is kept as an explicit step of the build sequence run by
+        `_build_default_dataset`.
+        """
+        return
+
+    def _create_default_dataset(self) -> None:
+        """Create default train/test splits and save them to files.
+
+        Input to this function is the raw dataset file: `_clean_dataset` does not produce a separate file, so
+        the file located by `_check_pre_requisites` is read directly. The steps are:
+          1. Verify that the data has no missing values and has the expected 29 columns.
+          2. Replace the `date` column with `day`, `month`, `year`, `hour` and `min` columns (33 in total).
+          3. Randomly split rows 80/20 into train and test sets and write them next to the raw file.
+
+        Nothing is done if both split files already exist.
+        """
+        # Do not generate datasets if they have already been generated.
+        default_train_dataset_file = self._dataset_dir / (_AEP_DATASET_FILE[0:-4] + "-default-train.csv")
+        default_test_dataset_file = self._dataset_dir / (_AEP_DATASET_FILE[0:-4] + "-default-test.csv")
+        if default_train_dataset_file.is_file() and default_test_dataset_file.is_file():
+            return
+
+        # Load the raw dataset into a data frame (a `date` column plus 28 other columns, including `Appliances`).
+        clean_dataset_file = self._dataset_dir / _AEP_DATASET_FILE
+        assert clean_dataset_file.is_file(), "Clean dataset does not exist (this is internal error)."
+
+        df: pd.DataFrame = pd.read_csv(clean_dataset_file)
+        logger.debug("Loaded raw AEP dataset from %s (shape=%s).", clean_dataset_file, df.shape)
+
+        # Sanity check for missing values
+        assert not df.isna().any().any(), "There are missing values in the DataFrame"
+        assert df.shape[1] == 29, f"Clean dataset expected to have 29 columns (shape={df.shape})."
+
+        # Convert the `date` column into datetime objects so that its components can be extracted.
+        timestamps = pd.to_datetime(df["date"])
+        # `date` itself is not kept in the output files, only its components are.
+        df = df.drop("date", axis=1)
+
+        # Extract the calendar components (new columns are appended in this order).
+        df["day"] = timestamps.dt.day
+        df["month"] = timestamps.dt.month
+        df["year"] = timestamps.dt.year
+        df["hour"] = timestamps.dt.hour
+        df["min"] = timestamps.dt.minute
+        # 29 original columns - `date` + 5 calendar components = 33.
+        assert df.shape[1] == 33, f"Clean dataset expected to have 33 columns (shape={df.shape})."
+
+        # Split train and test dataframes. The seed is fixed so that regenerating the files yields the same
+        # split on every machine (without it every fresh build would produce different train/test rows).
+        # NOTE(review): rows are shuffled although the data is a time series - confirm this is intended.
+        df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)
+
+        df_train.to_csv(default_train_dataset_file, index=False)
+        df_test.to_csv(default_test_dataset_file, index=False)
+        logger.debug("Saved default splits to %s and %s.", default_train_dataset_file, default_test_dataset_file)