Commit e49dfa5: Code cleanup

christopherbunn committed Jan 31, 2024
1 parent 6f765bb
Showing 3 changed files with 45 additions and 36 deletions.
71 changes: 37 additions & 34 deletions evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py

@@ -127,8 +127,16 @@ def fit(self, X, y=None):
         if self.time_index is None:
             raise ValueError("time_index cannot be None!")
 
-        # For the multiseries case, each series ID has individualized lag values
-        if isinstance(y, pd.DataFrame):
+        if y is None:
+            # Set lags to all possible lag values
+            self.statistically_significant_lags = np.arange(
+                self.start_delay,
+                self.start_delay + self.max_delay + 1,
+            )
+        else:
+            # For the multiseries case, each series ID has individualized lag values
+            if isinstance(y, pd.Series):
+                y = y.to_frame()
             self.statistically_significant_lags = {}
             for column in y.columns:
                 self.statistically_significant_lags[
@@ -139,13 +147,11 @@ def fit(self, X, y=None):
                     start_delay=self.start_delay,
                     max_delay=self.max_delay,
                 )
-        else:
-            self.statistically_significant_lags = self._find_significant_lags(
-                y,
-                conf_level=self.conf_level,
-                start_delay=self.start_delay,
-                max_delay=self.max_delay,
-            )
+            if len(y.columns) == 1:
+                self.statistically_significant_lags = (
+                    self.statistically_significant_lags[column]
+                )
+
         return self
 
     @staticmethod
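For intuition, here's a minimal standalone sketch of the two new branches in `fit` (the delay settings and target series are made up for illustration; `start_delay`/`max_delay` stand in for the featurizer's attributes):

```python
import numpy as np
import pandas as pd

# Made-up delay settings standing in for the featurizer's attributes.
start_delay, max_delay = 1, 3

# Branch 1: y is None, so every candidate lag is kept.
all_lags = np.arange(start_delay, start_delay + max_delay + 1)
print(all_lags)  # [1 2 3 4]

# Branch 2: a single-series target is promoted to a one-column DataFrame so
# the per-column (multiseries) loop handles both cases, then the one-entry
# dict is collapsed back to a plain array of lags.
y = pd.Series([0.5, 0.1, 0.9, 0.3], name="target")
if isinstance(y, pd.Series):
    y = y.to_frame()
lags_by_series = {column: all_lags for column in y.columns}  # placeholder lags
if len(y.columns) == 1:
    lags_by_series = lags_by_series[y.columns[0]]
print(lags_by_series)  # [1 2 3 4]
```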
@@ -169,31 +175,28 @@ def _encode_X_while_preserving_index(X_categorical):
     @staticmethod
     def _find_significant_lags(y, conf_level, start_delay, max_delay):
         all_lags = np.arange(start_delay, start_delay + max_delay + 1)
-        if y is not None:
-            # Compute the acf and find its peaks
-            acf_values, ci_intervals = acf(
-                y,
-                nlags=len(y) - 1,
-                fft=True,
-                alpha=conf_level,
-            )
-            peaks, _ = find_peaks(acf_values)
-            # Significant lags are the union of:
-            # 1. the peaks (local maxima) that are significant
-            # 2. The significant lags among the first 10 lags.
-            # We then filter the list to be in the range [start_delay, start_delay + max_delay]
-            index = np.arange(len(acf_values))
-            significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
-            first_significant_10 = index[:10][significant[:10]]
-            significant_lags = (
-                set(index[significant]).intersection(peaks).union(first_significant_10)
-            )
-            # If no lags are significant get the first lag
-            significant_lags = sorted(significant_lags.intersection(all_lags)) or [
-                start_delay,
-            ]
-        else:
-            significant_lags = all_lags
+        # Compute the acf and find its peaks
+        acf_values, ci_intervals = acf(
+            y,
+            nlags=len(y) - 1,
+            fft=True,
+            alpha=conf_level,
+        )
+        peaks, _ = find_peaks(acf_values)
+        # Significant lags are the union of:
+        # 1. the peaks (local maxima) that are significant
+        # 2. The significant lags among the first 10 lags.
+        # We then filter the list to be in the range [start_delay, start_delay + max_delay]
+        index = np.arange(len(acf_values))
+        significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
+        first_significant_10 = index[:10][significant[:10]]
+        significant_lags = (
+            set(index[significant]).intersection(peaks).union(first_significant_10)
+        )
+        # If no lags are significant get the first lag
+        significant_lags = sorted(significant_lags.intersection(all_lags)) or [
+            start_delay,
+        ]
         return significant_lags
 
     def _compute_rolling_transforms(self, X, y, original_features):
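With the `y is not None` guard gone, `_find_significant_lags` always runs the acf/peak selection. A self-contained sketch of that selection logic, using `statsmodels` and `scipy` as the source does (the period-7 series and settings are assumptions, not from the repo):

```python
import numpy as np
from scipy.signal import find_peaks
from statsmodels.tsa.stattools import acf

# Assumed example series: period-7 seasonality plus noise.
rng = np.random.default_rng(0)
y = np.sin(2 * np.pi * np.arange(200) / 7) + 0.1 * rng.standard_normal(200)

conf_level, start_delay, max_delay = 0.05, 1, 10
all_lags = np.arange(start_delay, start_delay + max_delay + 1)

# With alpha set, acf() returns the autocorrelations and their confidence intervals.
acf_values, ci_intervals = acf(y, nlags=len(y) - 1, fft=True, alpha=conf_level)
peaks, _ = find_peaks(acf_values)

# A lag is significant when its confidence interval excludes zero; keep the
# significant peaks plus any significant lags among the first 10, clipped to
# [start_delay, start_delay + max_delay], falling back to start_delay.
index = np.arange(len(acf_values))
significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0)
first_significant_10 = index[:10][significant[:10]]
significant_lags = (
    set(index[significant]).intersection(peaks).union(first_significant_10)
)
print(sorted(significant_lags.intersection(all_lags)) or [start_delay])
```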
9 changes: 7 additions & 2 deletions evalml/pipelines/multiseries_regression_pipeline.py
@@ -166,17 +166,21 @@ def predict_in_sample(
         unstacked_predictions = unstacked_predictions[
             [
                 series_id_target
-                for series_id_target in y_train_unstacked.columns
+                for series_id_target in self.series_id_target_names
                 if series_id_target in unstacked_predictions.columns
             ]
         ]
+
+        # Add `time_index` column to index for generating stacked datetime column in `stack_data()`
         unstacked_predictions.index = X_unstacked[self.time_index]
         stacked_predictions = stack_data(
             unstacked_predictions,
             include_series_id=True,
             series_id_name=self.series_id,
         )
-        stacked_predictions = stacked_predictions.reset_index()
+        # Move datetime index into separate date column to use when merging later
+        stacked_predictions = stacked_predictions.reset_index(drop=False)
+
         sp_dtypes = {
             self.time_index: X[self.time_index].dtype,
             self.series_id: X[self.series_id].dtype,
@@ -195,6 +199,7 @@ def predict_in_sample(
             stacked_predictions,
             on=[self.time_index, self.series_id],
         )[output_cols]
+
         # Index will start at the unstacked index, so we need to reset it to the original index
         stacked_predictions.index = X.index
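The reordering and stacking above rely on evalml's `stack_data` helper; as a rough pandas illustration of the intended flow (hypothetical column names, not the library's API), wide per-series predictions are melted into a long frame with a series-id column, and the datetime index is reset into a regular column so the final merge on `[time_index, series_id]` works:

```python
import pandas as pd

# Hypothetical wide predictions: one column per series, datetime index.
dates = pd.date_range("2024-01-01", periods=3, freq="D")
unstacked = pd.DataFrame(
    {"series_a": [1.0, 2.0, 3.0], "series_b": [4.0, 5.0, 6.0]},
    index=pd.Index(dates, name="date"),
)

# Roughly what stack_data(include_series_id=True) produces: a long frame
# with the series id as a column, datetime index reset into a regular column.
stacked = unstacked.stack().rename("target").reset_index()
stacked.columns = ["date", "series_id", "target"]

# Merging the stacked predictions back onto the input on [time_index, series_id].
X = pd.DataFrame(
    {"date": dates.repeat(2), "series_id": ["series_a", "series_b"] * 3},
)
merged = X.merge(stacked, on=["date", "series_id"])
print(merged)
```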
1 change: 1 addition & 0 deletions evalml/tests/automl_tests/test_iterative_algorithm.py
@@ -108,6 +108,7 @@ def test_iterative_algorithm_init(
                 parameters=search_parameters,
             )
             for estimator in estimators
+            # Generate both decomposer and non-decomposer pipelines when problem type is multiseries time series reg.
             for include_decomposer in (
                 [True, False] if is_regression(problem_type) else [False]
             )
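The new comment documents the nested-comprehension pattern in the test; a tiny sketch of how it fans out (fixture names are placeholders):

```python
# Placeholder stand-ins for the test's estimators and problem-type check.
estimators = ["estimator_a", "estimator_b"]
is_multiseries_regression = True

pipelines = [
    (estimator, include_decomposer)
    for estimator in estimators
    # Regression gets both decomposer and non-decomposer variants; other
    # problem types only get the non-decomposer pipeline.
    for include_decomposer in (
        [True, False] if is_multiseries_regression else [False]
    )
]
print(pipelines)
# [('estimator_a', True), ('estimator_a', False), ('estimator_b', True), ('estimator_b', False)]
```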
