Skip to content

Commit

Permalink
Consolidated separator symbols and added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
christopherbunn committed Oct 31, 2023
1 parent c24a576 commit dd0346d
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 27 deletions.
10 changes: 8 additions & 2 deletions evalml/pipelines/time_series_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,11 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
return return_intervals

if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
from evalml.pipelines.utils import stack_data, unstack_multiseries
from evalml.pipelines.utils import (
    MULTISERIES_SEPARATOR_SYMBOL,
    stack_data,
    unstack_multiseries,
)

Check warning on line 228 in evalml/pipelines/time_series_regression_pipeline.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/time_series_regression_pipeline.py#L228

Added line #L228 was not covered by tests

X, y = unstack_multiseries(
X,
Expand Down Expand Up @@ -268,7 +272,9 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
# `pred_intervals` are in {series_id: {coverage_label: bound_value}} form
for series_id, series_intervals in pred_intervals.items():
series_id_target_name = (
self.input_target_name + "|" + str(series_id)
self.input_target_name
+ MULTISERIES_SEPARATOR_SYMBOL
+ str(series_id)
)
series_id_prediction_intervals = _get_series_intervals(
series_intervals,
Expand Down
15 changes: 10 additions & 5 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
from evalml.utils.gen_utils import contains_all_ts_parameters

DECOMPOSER_PERIOD_CAP = 1000
MULTISERIES_SEPARATOR_SYMBOL = "|"


def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
Expand Down Expand Up @@ -1418,7 +1419,7 @@ def unstack_multiseries(
for column_name in full_dataset.columns.drop([time_index, series_id]):
new_column = single_series[column_name]
new_column.index = new_time_index
new_column.name = f"{column_name}|{s_id}"
new_column.name = f"{column_name}{MULTISERIES_SEPARATOR_SYMBOL}{s_id}"

Check warning on line 1422 in evalml/pipelines/utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/utils.py#L1422

Added line #L1422 was not covered by tests

if column_name == target_name:
y_unstacked_cols.append(new_column)
Expand Down Expand Up @@ -1468,7 +1469,9 @@ def stack_data(

# Extract the original column name
series_id_with_name = stacked_series.index.droplevel()
stacked_series.name = "".join(series_id_with_name[0].split("|")[:-1])
stacked_series.name = "".join(
    series_id_with_name[0].split(MULTISERIES_SEPARATOR_SYMBOL)[:-1],
)

Check warning on line 1472 in evalml/pipelines/utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/utils.py#L1472

Added line #L1472 was not covered by tests

# If the index is the time index, keep it
if not data.index.is_numeric() and starting_index is None:
Expand All @@ -1485,7 +1488,9 @@ def stack_data(
# Pull out the series id information, if requested
if include_series_id:
series_id_col = pd.Series(
series_id_with_name.map(lambda col_name: col_name.split("|")[-1]),
series_id_with_name.map(
lambda col_name: col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1],
),
name=series_id_name or "series_id",
index=stacked_series.index,
)
Expand Down Expand Up @@ -1516,8 +1521,8 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values
for col in X.columns:
if col == time_index:
continue
separated_name = col.split("|")
original_columns.add("".join(separated_name[:-1]))
separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL)
original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1]))
series_ids.add(separated_name[-1])

Check warning on line 1525 in evalml/pipelines/utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/utils.py#L1524-L1525

Added lines #L1524 - L1525 were not covered by tests

if len(series_ids) == 0:
Expand Down
5 changes: 4 additions & 1 deletion evalml/tests/component_tests/test_time_series_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)

from evalml.pipelines import TimeSeriesFeaturizer
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

Check warning on line 18 in evalml/tests/component_tests/test_time_series_featurizer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_featurizer.py#L18

Added line #L18 was not covered by tests

ROLLING_TRANSFORM_METHOD_NAME = "_compute_rolling_transforms"
DELAYED_FEATURES_METHOD_NAME = "_compute_delays"
Expand Down Expand Up @@ -991,7 +992,9 @@ def test_featurizer_y_dataframe(multiseries_ts_data_unstacked):

assert featurizer.statistically_significant_lags == [6]

expected_y_cols = [f"target|{i}_delay_6" for i in range(y.shape[1])]
expected_y_cols = [
    f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}_delay_6" for i in range(y.shape[1])
]

Check warning on line 995 in evalml/tests/component_tests/test_time_series_featurizer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_featurizer.py#L995

Added line #L995 was not covered by tests
X_t = featurizer.transform(X, y)
for expected_y_col in expected_y_cols:
assert expected_y_col in X_t.columns
13 changes: 11 additions & 2 deletions evalml/tests/component_tests/test_time_series_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)

from evalml.pipelines.components import TimeSeriesImputer
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

Check warning on line 14 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L14

Added line #L14 was not covered by tests


def test_invalid_strategy_parameters():
Expand Down Expand Up @@ -745,7 +746,12 @@ def test_time_series_imputer_multiseries(
_, y_imputed = imputer.transform(X, y)
assert isinstance(y_imputed, pd.DataFrame)

y_expected = pd.DataFrame({f"target|{i}": range(i, 100, 5) for i in range(5)})
y_expected = pd.DataFrame(
    {
        f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
        for i in range(5)
    },
)

Check warning on line 749 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L749

Added line #L749 was not covered by tests
assert_frame_equal(y_imputed, y_expected, check_dtype=False)


Expand Down Expand Up @@ -777,7 +783,10 @@ def test_time_series_imputer_multiseries_some_columns_all_nan(
_, y_imputed = imputer.transform(X, y)

y_expected = pd.DataFrame(
{f"target|{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
{
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
for i in range(num_nan_cols, 5)
},
)
assert_frame_equal(y_imputed, y_expected, check_dtype=False)

Expand Down
21 changes: 18 additions & 3 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1094,12 +1094,27 @@ def multiseries_ts_data_stacked():

@pytest.fixture
def multiseries_ts_data_unstacked():
feature_a = pd.DataFrame({f"feature_a|{i}": range(i, 100, 5) for i in range(5)})
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

Check warning on line 1097 in evalml/tests/conftest.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/conftest.py#L1097

Added line #L1097 was not covered by tests

feature_a = pd.DataFrame(
    {
        f"feature_a{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
        for i in range(5)
    },
)

Check warning on line 1099 in evalml/tests/conftest.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/conftest.py#L1099

Added line #L1099 was not covered by tests
feature_b = pd.DataFrame(
{f"feature_b|{i}": range(99 - i, -1, -5) for i in range(5)},
{
f"feature_b{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(99 - i, -1, -5)
for i in range(5)
},
)
X = pd.concat([feature_a, feature_b], axis=1)
y = pd.DataFrame({f"target|{i}": range(i, 100, 5) for i in range(5)})
y = pd.DataFrame(
    {
        f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
        for i in range(5)
    },
)

Check warning on line 1112 in evalml/tests/conftest.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/conftest.py#L1112

Added line #L1112 was not covered by tests

X["date"] = pd.date_range(start="1/1/2018", periods=20)
return X, y
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def test_multiseries_pipeline_fit(
assert pipeline.frequency is not None


Check warning on line 93 in evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py#L93

Added line #L93 was not covered by tests

@pytest.mark.parametrize("include_series_id", [True, False])
def test_multiseries_pipeline_predict_in_sample(
include_series_id,
multiseries_ts_data_stacked,
component_graph,
pipeline_parameters,
Expand All @@ -111,14 +113,19 @@ def test_multiseries_pipeline_predict_in_sample(
y_holdout,
X_train=X_train,
y_train=y_train,
include_series_id=include_series_id,
)
expected = pd.Series(
range(55, 65),
index=range(90, 100),
name="target",
dtype="float64",
)
pd.testing.assert_series_equal(y_pred, expected)
if include_series_id:
    expected = pd.concat([X_holdout["series_id"], expected], axis=1)
    pd.testing.assert_frame_equal(y_pred, expected)
else:
    pd.testing.assert_series_equal(y_pred, expected)

Check warning on line 126 in evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py#L124-L126

Added lines #L124 - L126 were not covered by tests

Check warning on line 128 in evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py#L128

Added line #L128 was not covered by tests


@pytest.mark.parametrize("forecast_horizon", [1, 7])
Expand Down
4 changes: 3 additions & 1 deletion evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
handle_component_class,
)
from evalml.pipelines.utils import (
MULTISERIES_SEPARATOR_SYMBOL,
_get_pipeline_base_class,
_get_preprocessing_components,
_make_pipeline_from_multiple_graphs,
Expand Down Expand Up @@ -1404,7 +1405,8 @@ def test_unstack_multiseries(
X_unstacked, y_unstacked = multiseries_ts_data_unstacked
y.name = target_name
y_unstacked.columns = [
f"{target_name}|{i}" for i in range(len(y_unstacked.columns))
f"{target_name}{MULTISERIES_SEPARATOR_SYMBOL}{i}"
for i in range(len(y_unstacked.columns))
]

X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries(
Expand Down
12 changes: 0 additions & 12 deletions evalml/utils/woodwork_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,20 +106,8 @@ def _schema_is_equal(first, other):
Returns:
bool: Whether or not the two schemas are equal
"""
# first_types_index = first.types.index.tolist()
# other_types_index = other.types.index.tolist()
# first_types_index.sort()
# other_types_index.sort()
# if first_types_index != other_types_index:
if first.types.index.tolist() != other.types.index.tolist():
return False
# first_logical_types = first.types["Logical Type"].astype(str).tolist()
# other_logical_types = other.types["Logical Type"].astype(str).tolist()
# first_logical_types.sort()
# other_logical_types.sort()
# logical = [x if x != "Integer" else "Double" for x in first_logical_types] == [
# x if x != "Integer" else "Double" for x in other_logical_types
# ]
logical = [
x if x != "Integer" else "Double"
for x in first.types["Logical Type"].astype(str).tolist()
Expand Down

0 comments on commit dd0346d

Please sign in to comment.