From e49dfa5000f29229a9eb7ce4aca88eb7c0d7972c Mon Sep 17 00:00:00 2001
From: christopherbunn
Date: Wed, 31 Jan 2024 15:01:57 -0500
Subject: [PATCH] Code cleanup

---
 .../preprocessing/time_series_featurizer.py   | 71 ++++++++++---------
 .../multiseries_regression_pipeline.py        |  9 ++-
 .../automl_tests/test_iterative_algorithm.py  |  1 +
 3 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py
index 61246a78d2..4726f2710d 100644
--- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py
@@ -127,8 +127,16 @@ def fit(self, X, y=None):
         if self.time_index is None:
             raise ValueError("time_index cannot be None!")
 
-        # For the multiseries case, each series ID has individualized lag values
-        if isinstance(y, pd.DataFrame):
+        if y is None:
+            # Set lags to all possible lag values
+            self.statistically_significant_lags = np.arange(
+                self.start_delay,
+                self.start_delay + self.max_delay + 1,
+            )
+        else:
+            # For the multiseries case, each series ID has individualized lag values
+            if isinstance(y, pd.Series):
+                y = y.to_frame()
             self.statistically_significant_lags = {}
             for column in y.columns:
                 self.statistically_significant_lags[
@@ -139,13 +147,11 @@ def fit(self, X, y=None):
                     start_delay=self.start_delay,
                     max_delay=self.max_delay,
                 )
-        else:
-            self.statistically_significant_lags = self._find_significant_lags(
-                y,
-                conf_level=self.conf_level,
-                start_delay=self.start_delay,
-                max_delay=self.max_delay,
-            )
+            if len(y.columns) == 1:
+                self.statistically_significant_lags = (
+                    self.statistically_significant_lags[column]
+                )
+                return self
         return self
 
     @staticmethod
@@ -169,31 +175,28 @@ def _encode_X_while_preserving_index(X_categorical):
     @staticmethod
     def _find_significant_lags(y, conf_level, start_delay, max_delay):
         all_lags = np.arange(start_delay, start_delay + max_delay + 1)
-        if y is not None:
-            # Compute the acf and find its peaks
-            acf_values, ci_intervals = acf(
-                y,
-                nlags=len(y) - 1,
-                fft=True,
-                alpha=conf_level,
-            )
-            peaks, _ = find_peaks(acf_values)
-            # Significant lags are the union of:
-            # 1. the peaks (local maxima) that are significant
-            # 2. The significant lags among the first 10 lags.
-            # We then filter the list to be in the range [start_delay, start_delay + max_delay]
-            index = np.arange(len(acf_values))
-            significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
-            first_significant_10 = index[:10][significant[:10]]
-            significant_lags = (
-                set(index[significant]).intersection(peaks).union(first_significant_10)
-            )
-            # If no lags are significant get the first lag
-            significant_lags = sorted(significant_lags.intersection(all_lags)) or [
-                start_delay,
-            ]
-        else:
-            significant_lags = all_lags
+        # Compute the acf and find its peaks
+        acf_values, ci_intervals = acf(
+            y,
+            nlags=len(y) - 1,
+            fft=True,
+            alpha=conf_level,
+        )
+        peaks, _ = find_peaks(acf_values)
+        # Significant lags are the union of:
+        # 1. the peaks (local maxima) that are significant
+        # 2. The significant lags among the first 10 lags.
+        # We then filter the list to be in the range [start_delay, start_delay + max_delay]
+        index = np.arange(len(acf_values))
+        significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
+        first_significant_10 = index[:10][significant[:10]]
+        significant_lags = (
+            set(index[significant]).intersection(peaks).union(first_significant_10)
+        )
+        # If no lags are significant get the first lag
+        significant_lags = sorted(significant_lags.intersection(all_lags)) or [
+            start_delay,
+        ]
         return significant_lags
 
     def _compute_rolling_transforms(self, X, y, original_features):
diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py
index 6c0cf64b98..df9bce6632 100644
--- a/evalml/pipelines/multiseries_regression_pipeline.py
+++ b/evalml/pipelines/multiseries_regression_pipeline.py
@@ -166,17 +166,21 @@ def predict_in_sample(
         unstacked_predictions = unstacked_predictions[
             [
                 series_id_target
-                for series_id_target in y_train_unstacked.columns
+                for series_id_target in self.series_id_target_names
                 if series_id_target in unstacked_predictions.columns
             ]
         ]
+
+        # Set the index to the `time_index` column so `stack_data()` can generate the stacked datetime column
         unstacked_predictions.index = X_unstacked[self.time_index]
         stacked_predictions = stack_data(
             unstacked_predictions,
             include_series_id=True,
             series_id_name=self.series_id,
         )
-        stacked_predictions = stacked_predictions.reset_index()
+        # Move the datetime index into a separate date column to use when merging later
+        stacked_predictions = stacked_predictions.reset_index(drop=False)
+
         sp_dtypes = {
             self.time_index: X[self.time_index].dtype,
             self.series_id: X[self.series_id].dtype,
@@ -195,6 +199,7 @@ def predict_in_sample(
             stacked_predictions,
             on=[self.time_index, self.series_id],
         )[output_cols]
 
+        # The index will start at the unstacked index, so reset it to the original index
         stacked_predictions.index = X.index
         return stacked_predictions
diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py
index 978d212c06..4fc4f0f538 100644
--- a/evalml/tests/automl_tests/test_iterative_algorithm.py
+++ b/evalml/tests/automl_tests/test_iterative_algorithm.py
@@ -108,6 +108,7 @@ def test_iterative_algorithm_init(
             parameters=search_parameters,
         )
         for estimator in estimators
+        # Generate both decomposer and non-decomposer pipelines for regression problem types, including multiseries time series regression
         for include_decomposer in (
            [True, False] if is_regression(problem_type) else [False]
        )
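
For reviewers who want to poke at the lag-selection behavior outside the component, below is a minimal standalone sketch (not part of the patch) of what `_find_significant_lags` computes now that the `y is None` guard has moved up into `fit`. The free-function form, the parameter defaults, and the synthetic series are illustrative assumptions only; in evalml the component supplies `conf_level`, `start_delay`, and `max_delay` from its own parameters.

```python
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from statsmodels.tsa.stattools import acf

def find_significant_lags(y, conf_level=0.05, start_delay=1, max_delay=10):
    """Standalone sketch mirroring TimeSeriesFeaturizer._find_significant_lags."""
    all_lags = np.arange(start_delay, start_delay + max_delay + 1)
    # Compute the ACF with confidence intervals and locate its local maxima
    acf_values, ci_intervals = acf(y, nlags=len(y) - 1, fft=True, alpha=conf_level)
    peaks, _ = find_peaks(acf_values)
    # A lag is significant when its confidence interval excludes zero
    index = np.arange(len(acf_values))
    significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
    first_significant_10 = index[:10][significant[:10]]
    # Union of significant peaks and significant early lags, clipped to the
    # [start_delay, start_delay + max_delay] window; fall back to start_delay
    significant_lags = (
        set(index[significant]).intersection(peaks).union(first_significant_10)
    )
    return sorted(significant_lags.intersection(all_lags)) or [start_delay]

# Synthetic daily series with a weekly cycle; lag 7 should surface in the output
rng = np.random.default_rng(0)
t = np.arange(200)
y = pd.Series(np.sin(2 * np.pi * t / 7) + 0.3 * rng.standard_normal(200))
print(find_significant_lags(y))
```

The union with the first ten lags guards against short-horizon autocorrelation that is significant without forming a local maximum, which is why `fit` can hand each target column of a multiseries frame to this helper and still get a sensible per-series lag list.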