From e49dfa5000f29229a9eb7ce4aca88eb7c0d7972c Mon Sep 17 00:00:00 2001
From: christopherbunn
Date: Wed, 31 Jan 2024 15:01:57 -0500
Subject: [PATCH] Code cleanup

---
 .../preprocessing/time_series_featurizer.py   | 71 ++++++++++---------
 .../multiseries_regression_pipeline.py        |  9 ++-
 .../automl_tests/test_iterative_algorithm.py  |  1 +
 3 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py
index 61246a78d2..4726f2710d 100644
--- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py
@@ -127,8 +127,16 @@ def fit(self, X, y=None):
         if self.time_index is None:
             raise ValueError("time_index cannot be None!")
 
-        # For the multiseries case, each series ID has individualized lag values
-        if isinstance(y, pd.DataFrame):
+        if y is None:
+            # Set lags to all possible lag values
+            self.statistically_significant_lags = np.arange(
+                self.start_delay,
+                self.start_delay + self.max_delay + 1,
+            )
+        else:
+            # For the multiseries case, each series ID has individualized lag values
+            if isinstance(y, pd.Series):
+                y = y.to_frame()
             self.statistically_significant_lags = {}
             for column in y.columns:
                 self.statistically_significant_lags[
@@ -139,13 +147,11 @@ def fit(self, X, y=None):
                     start_delay=self.start_delay,
                     max_delay=self.max_delay,
                 )
-        else:
-            self.statistically_significant_lags = self._find_significant_lags(
-                y,
-                conf_level=self.conf_level,
-                start_delay=self.start_delay,
-                max_delay=self.max_delay,
-            )
+            if len(y.columns) == 1:
+                self.statistically_significant_lags = (
+                    self.statistically_significant_lags[column]
+                )
+                return self
         return self
 
     @staticmethod
@@ -169,31 +175,28 @@ def _encode_X_while_preserving_index(X_categorical):
     @staticmethod
     def _find_significant_lags(y, conf_level, start_delay, max_delay):
         all_lags = np.arange(start_delay, start_delay + max_delay + 1)
-        if y is not None:
-            # Compute the acf and find its peaks
-            acf_values, ci_intervals = acf(
-                y,
-                nlags=len(y) - 1,
-                fft=True,
-                alpha=conf_level,
-            )
-            peaks, _ = find_peaks(acf_values)
-            # Significant lags are the union of:
-            # 1. the peaks (local maxima) that are significant
-            # 2. The significant lags among the first 10 lags.
-            # We then filter the list to be in the range [start_delay, start_delay + max_delay]
-            index = np.arange(len(acf_values))
-            significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
-            first_significant_10 = index[:10][significant[:10]]
-            significant_lags = (
-                set(index[significant]).intersection(peaks).union(first_significant_10)
-            )
-            # If no lags are significant get the first lag
-            significant_lags = sorted(significant_lags.intersection(all_lags)) or [
-                start_delay,
-            ]
-        else:
-            significant_lags = all_lags
+        # Compute the acf and find its peaks
+        acf_values, ci_intervals = acf(
+            y,
+            nlags=len(y) - 1,
+            fft=True,
+            alpha=conf_level,
+        )
+        peaks, _ = find_peaks(acf_values)
+        # Significant lags are the union of:
+        # 1. the peaks (local maxima) that are significant
+        # 2. The significant lags among the first 10 lags.
+        # We then filter the list to be in the range [start_delay, start_delay + max_delay]
+        index = np.arange(len(acf_values))
+        significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
+        first_significant_10 = index[:10][significant[:10]]
+        significant_lags = (
+            set(index[significant]).intersection(peaks).union(first_significant_10)
+        )
+        # If no lags are significant get the first lag
+        significant_lags = sorted(significant_lags.intersection(all_lags)) or [
+            start_delay,
+        ]
         return significant_lags
 
     def _compute_rolling_transforms(self, X, y, original_features):
diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py
index 6c0cf64b98..df9bce6632 100644
--- a/evalml/pipelines/multiseries_regression_pipeline.py
+++ b/evalml/pipelines/multiseries_regression_pipeline.py
@@ -166,17 +166,21 @@ def predict_in_sample(
         unstacked_predictions = unstacked_predictions[
             [
                 series_id_target
-                for series_id_target in y_train_unstacked.columns
+                for series_id_target in self.series_id_target_names
                 if series_id_target in unstacked_predictions.columns
             ]
         ]
+
+        # Set the index to the `time_index` column so `stack_data()` can generate the stacked datetime column
         unstacked_predictions.index = X_unstacked[self.time_index]
         stacked_predictions = stack_data(
             unstacked_predictions,
             include_series_id=True,
             series_id_name=self.series_id,
         )
-        stacked_predictions = stacked_predictions.reset_index()
+        # Move the datetime index into a separate date column to use when merging later
+        stacked_predictions = stacked_predictions.reset_index(drop=False)
+
         sp_dtypes = {
             self.time_index: X[self.time_index].dtype,
             self.series_id: X[self.series_id].dtype,
@@ -195,6 +199,7 @@ def predict_in_sample(
             stacked_predictions,
             on=[self.time_index, self.series_id],
         )[output_cols]
 
+        # The index will start at the unstacked index, so reset it to the original index
         stacked_predictions.index = X.index
         return stacked_predictions
diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py
index 978d212c06..4fc4f0f538 100644
--- a/evalml/tests/automl_tests/test_iterative_algorithm.py
+++ b/evalml/tests/automl_tests/test_iterative_algorithm.py
@@ -108,6 +108,7 @@ def test_iterative_algorithm_init(
             parameters=search_parameters,
         )
         for estimator in estimators
+        # Generate both decomposer and non-decomposer pipelines for regression problem types, including multiseries time series regression
         for include_decomposer in (
            [True, False] if is_regression(problem_type) else [False]
        )
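
For reviewers who want to poke at the lag-selection behavior outside the component, below is a minimal standalone sketch (not part of the patch) of what `_find_significant_lags` computes now that the `y is None` guard has moved up into `fit`. The free-function form, the parameter defaults, and the synthetic series are illustrative assumptions only; in evalml the component supplies `conf_level`, `start_delay`, and `max_delay` from its own parameters.

```python
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from statsmodels.tsa.stattools import acf

def find_significant_lags(y, conf_level=0.05, start_delay=1, max_delay=10):
    """Standalone sketch mirroring TimeSeriesFeaturizer._find_significant_lags."""
    all_lags = np.arange(start_delay, start_delay + max_delay + 1)
    # Compute the ACF with confidence intervals and locate its local maxima
    acf_values, ci_intervals = acf(y, nlags=len(y) - 1, fft=True, alpha=conf_level)
    peaks, _ = find_peaks(acf_values)
    # A lag is significant when its confidence interval excludes zero
    index = np.arange(len(acf_values))
    significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
    first_significant_10 = index[:10][significant[:10]]
    # Union of significant peaks and significant early lags, clipped to the
    # [start_delay, start_delay + max_delay] window; fall back to start_delay
    significant_lags = (
        set(index[significant]).intersection(peaks).union(first_significant_10)
    )
    return sorted(significant_lags.intersection(all_lags)) or [start_delay]

# Synthetic daily series with a weekly cycle; lag 7 should surface in the output
rng = np.random.default_rng(0)
t = np.arange(200)
y = pd.Series(np.sin(2 * np.pi * t / 7) + 0.3 * rng.standard_normal(200))
print(find_significant_lags(y))
```

The union with the first ten lags guards against short-horizon autocorrelation that is significant without forming a local maximum, which is why `fit` can hand each target column of a multiseries frame to this helper and still get a sensible per-series lag list.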