Skip to content

Commit

Permalink
Consolidated separator symbols and added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
christopherbunn committed Oct 31, 2023
1 parent c24a576 commit dd0346d
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 27 deletions.
10 changes: 8 additions & 2 deletions evalml/pipelines/time_series_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,11 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
return return_intervals

if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
from evalml.pipelines.utils import stack_data, unstack_multiseries
from evalml.pipelines.utils import (
    MULTISERIES_SEPARATOR_SYMBOL,
    stack_data,
    unstack_multiseries,
)

Check warning on line 228 in evalml/pipelines/time_series_regression_pipeline.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/time_series_regression_pipeline.py#L228

Added line #L228 was not covered by tests

X, y = unstack_multiseries(
X,
Expand Down Expand Up @@ -268,7 +272,9 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
# `pred_intervals` are in {series_id: {coverage_label: bound_value}} form
for series_id, series_intervals in pred_intervals.items():
series_id_target_name = (
self.input_target_name + "|" + str(series_id)
self.input_target_name
+ MULTISERIES_SEPARATOR_SYMBOL
+ str(series_id)
)
series_id_prediction_intervals = _get_series_intervals(
series_intervals,
Expand Down
15 changes: 10 additions & 5 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
from evalml.utils.gen_utils import contains_all_ts_parameters

DECOMPOSER_PERIOD_CAP = 1000
MULTISERIES_SEPARATOR_SYMBOL = "|"


def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
Expand Down Expand Up @@ -1418,7 +1419,7 @@ def unstack_multiseries(
for column_name in full_dataset.columns.drop([time_index, series_id]):
new_column = single_series[column_name]
new_column.index = new_time_index
new_column.name = f"{column_name}|{s_id}"
new_column.name = f"{column_name}{MULTISERIES_SEPARATOR_SYMBOL}{s_id}"

Check warning on line 1422 in evalml/pipelines/utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/utils.py#L1422

Added line #L1422 was not covered by tests

if column_name == target_name:
y_unstacked_cols.append(new_column)
Expand Down Expand Up @@ -1468,7 +1469,9 @@ def stack_data(

# Extract the original column name
series_id_with_name = stacked_series.index.droplevel()
stacked_series.name = "".join(series_id_with_name[0].split("|")[:-1])
stacked_series.name = "".join(
    series_id_with_name[0].split(MULTISERIES_SEPARATOR_SYMBOL)[:-1],
)

Check warning on line 1472 in evalml/pipelines/utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/utils.py#L1472

Added line #L1472 was not covered by tests

# If the index is the time index, keep it
if not data.index.is_numeric() and starting_index is None:
Expand All @@ -1485,7 +1488,9 @@ def stack_data(
# Pull out the series id information, if requested
if include_series_id:
series_id_col = pd.Series(
series_id_with_name.map(lambda col_name: col_name.split("|")[-1]),
series_id_with_name.map(
lambda col_name: col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1],
),
name=series_id_name or "series_id",
index=stacked_series.index,
)
Expand Down Expand Up @@ -1516,8 +1521,8 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values
for col in X.columns:
if col == time_index:
continue
separated_name = col.split("|")
original_columns.add("".join(separated_name[:-1]))
separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL)
original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1]))
series_ids.add(separated_name[-1])

Check warning on line 1525 in evalml/pipelines/utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/utils.py#L1524-L1525

Added lines #L1524 - L1525 were not covered by tests

if len(series_ids) == 0:
Expand Down
5 changes: 4 additions & 1 deletion evalml/tests/component_tests/test_time_series_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)

from evalml.pipelines import TimeSeriesFeaturizer
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

Check warning on line 18 in evalml/tests/component_tests/test_time_series_featurizer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_featurizer.py#L18

Added line #L18 was not covered by tests

ROLLING_TRANSFORM_METHOD_NAME = "_compute_rolling_transforms"
DELAYED_FEATURES_METHOD_NAME = "_compute_delays"
Expand Down Expand Up @@ -991,7 +992,9 @@ def test_featurizer_y_dataframe(multiseries_ts_data_unstacked):

assert featurizer.statistically_significant_lags == [6]

expected_y_cols = [f"target|{i}_delay_6" for i in range(y.shape[1])]
expected_y_cols = [
    f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}_delay_6" for i in range(y.shape[1])
]

Check warning on line 995 in evalml/tests/component_tests/test_time_series_featurizer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_featurizer.py#L995

Added line #L995 was not covered by tests
X_t = featurizer.transform(X, y)
for expected_y_col in expected_y_cols:
assert expected_y_col in X_t.columns
13 changes: 11 additions & 2 deletions evalml/tests/component_tests/test_time_series_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)

from evalml.pipelines.components import TimeSeriesImputer
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

Check warning on line 14 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L14

Added line #L14 was not covered by tests


def test_invalid_strategy_parameters():
Expand Down Expand Up @@ -745,7 +746,12 @@ def test_time_series_imputer_multiseries(
_, y_imputed = imputer.transform(X, y)
assert isinstance(y_imputed, pd.DataFrame)

y_expected = pd.DataFrame({f"target|{i}": range(i, 100, 5) for i in range(5)})
y_expected = pd.DataFrame(
    {
        f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
        for i in range(5)
    },
)

Check warning on line 749 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L749

Added line #L749 was not covered by tests
assert_frame_equal(y_imputed, y_expected, check_dtype=False)


Expand Down Expand Up @@ -777,7 +783,10 @@ def test_time_series_imputer_multiseries_some_columns_all_nan(
_, y_imputed = imputer.transform(X, y)

y_expected = pd.DataFrame(
{f"target|{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
{
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
for i in range(num_nan_cols, 5)
},
)
assert_frame_equal(y_imputed, y_expected, check_dtype=False)

Expand Down
21 changes: 18 additions & 3 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1094,12 +1094,27 @@ def multiseries_ts_data_stacked():

@pytest.fixture
def multiseries_ts_data_unstacked():
feature_a = pd.DataFrame({f"feature_a|{i}": range(i, 100, 5) for i in range(5)})
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

Check warning on line 1097 in evalml/tests/conftest.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/conftest.py#L1097

Added line #L1097 was not covered by tests

feature_a = pd.DataFrame(
    {
        f"feature_a{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
        for i in range(5)
    },
)

Check warning on line 1099 in evalml/tests/conftest.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/conftest.py#L1099

Added line #L1099 was not covered by tests
feature_b = pd.DataFrame(
{f"feature_b|{i}": range(99 - i, -1, -5) for i in range(5)},
{
f"feature_b{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(99 - i, -1, -5)
for i in range(5)
},
)
X = pd.concat([feature_a, feature_b], axis=1)
y = pd.DataFrame({f"target|{i}": range(i, 100, 5) for i in range(5)})
y = pd.DataFrame(
    {
        f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
        for i in range(5)
    },
)

Check warning on line 1112 in evalml/tests/conftest.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/conftest.py#L1112

Added line #L1112 was not covered by tests

X["date"] = pd.date_range(start="1/1/2018", periods=20)
return X, y
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def test_multiseries_pipeline_fit(
assert pipeline.frequency is not None


Check warning on line 93 in evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py#L93

Added line #L93 was not covered by tests

@pytest.mark.parametrize("include_series_id", [True, False])
def test_multiseries_pipeline_predict_in_sample(
include_series_id,
multiseries_ts_data_stacked,
component_graph,
pipeline_parameters,
Expand All @@ -111,14 +113,19 @@ def test_multiseries_pipeline_predict_in_sample(
y_holdout,
X_train=X_train,
y_train=y_train,
include_series_id=include_series_id,
)
expected = pd.Series(
range(55, 65),
index=range(90, 100),
name="target",
dtype="float64",
)
pd.testing.assert_series_equal(y_pred, expected)
if include_series_id:
    expected = pd.concat([X_holdout["series_id"], expected], axis=1)
    pd.testing.assert_frame_equal(y_pred, expected)
else:
    pd.testing.assert_series_equal(y_pred, expected)

Check warning on line 126 in evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py#L124-L126

Added lines #L124 - L126 were not covered by tests

Check warning on line 128 in evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py#L128

Added line #L128 was not covered by tests


@pytest.mark.parametrize("forecast_horizon", [1, 7])
Expand Down
4 changes: 3 additions & 1 deletion evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
handle_component_class,
)
from evalml.pipelines.utils import (
MULTISERIES_SEPARATOR_SYMBOL,
_get_pipeline_base_class,
_get_preprocessing_components,
_make_pipeline_from_multiple_graphs,
Expand Down Expand Up @@ -1404,7 +1405,8 @@ def test_unstack_multiseries(
X_unstacked, y_unstacked = multiseries_ts_data_unstacked
y.name = target_name
y_unstacked.columns = [
f"{target_name}|{i}" for i in range(len(y_unstacked.columns))
f"{target_name}{MULTISERIES_SEPARATOR_SYMBOL}{i}"
for i in range(len(y_unstacked.columns))
]

X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries(
Expand Down
12 changes: 0 additions & 12 deletions evalml/utils/woodwork_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,20 +106,8 @@ def _schema_is_equal(first, other):
Returns:
bool: Whether or not the two schemas are equal
"""
# first_types_index = first.types.index.tolist()
# other_types_index = other.types.index.tolist()
# first_types_index.sort()
# other_types_index.sort()
# if first_types_index != other_types_index:
if first.types.index.tolist() != other.types.index.tolist():
return False
# first_logical_types = first.types["Logical Type"].astype(str).tolist()
# other_logical_types = other.types["Logical Type"].astype(str).tolist()
# first_logical_types.sort()
# other_logical_types.sort()
# logical = [x if x != "Integer" else "Double" for x in first_logical_types] == [
# x if x != "Integer" else "Double" for x in other_logical_types
# ]
logical = [
x if x != "Integer" else "Double"
for x in first.types["Logical Type"].astype(str).tolist()
Expand Down

0 comments on commit dd0346d

Please sign in to comment.