From 3907c7ada8c63783a666adfd978df4be761cc971 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Tue, 5 Sep 2023 14:44:45 -0700 Subject: [PATCH 01/11] datetime format extended for multiseries --- .../data_checks/datetime_format_data_check.py | 309 +++++++++--------- 1 file changed, 160 insertions(+), 149 deletions(-) diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index a3c9af8082..9e2d890ce4 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -16,22 +16,33 @@ class DateTimeFormatDataCheck(DataCheck): """Check if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators. + If used for multiseries problem, works specifically on stacked datasets + Args: datetime_column (str, int): The name of the datetime column. If the datetime values are in the index, then pass "index". nan_duplicate_threshold (float): The percentage of values in the `datetime_column` that must not be duplicate or nan before `DATETIME_NO_FREQUENCY_INFERRED` is returned instead of `DATETIME_HAS_UNEVEN_INTERVALS`. For example, if this is set to 0.80, then only 20% of the values in `datetime_column` can be duplicate or nan. Defaults to 0.75. + series_id (str): The name of the series_id column for multiseries. Defaults to None """ - def __init__(self, datetime_column="index", nan_duplicate_threshold=0.75): + def __init__( + self, + datetime_column="index", + nan_duplicate_threshold=0.75, + series_id=None, + ): self.datetime_column = datetime_column self.nan_duplicate_threshold = nan_duplicate_threshold + self.series_id = series_id def validate(self, X, y): """Checks if the target data has equal intervals and is monotonically increasing. - Will return a DataCheckError if the data is not a datetime type, is not increasing, has redundant or missing row(s), + Will return DataCheckError(s) if the data is not a datetime type, is not increasing, has redundant or missing row(s), contains invalid (NaN or None) values, or has values that don't align with the assumed frequency. + If used for multiseries problem, works specifically on stacked datasets + Args: X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Target data. @@ -199,52 +210,6 @@ def validate(self, X, y): ... } ... ] - The column "Weeks" has a date that does not follow the weekly pattern, which is considered misaligned. - - >>> X = pd.DataFrame(pd.date_range("2021-01-01", freq="W", periods=12).append(pd.date_range("2021-03-22", periods=1)), columns=["Weeks"]) - >>> ww_payload = infer_frequency(X["Weeks"], debug=True, window_length=5, threshold=0.8) - >>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="Weeks") - >>> assert datetime_format_dc.validate(X, y) == [ - ... { - ... "message": "Column 'Weeks' has datetime values that do not align with the inferred frequency.", - ... "data_check_name": "DateTimeFormatDataCheck", - ... "level": "error", - ... "details": {"columns": None, "rows": None}, - ... "code": "DATETIME_HAS_MISALIGNED_VALUES", - ... "action_options": [] - ... }, - ... { - ... "message": "A frequency was detected in column 'Weeks', but there are faulty datetime values that need to be addressed.", - ... "data_check_name": "DateTimeFormatDataCheck", - ... "level": "error", - ... "code": "DATETIME_HAS_UNEVEN_INTERVALS", - ... "details": {'columns': None, 'rows': None}, - ... "action_options": [ - ... { - ... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', - ... 'data_check_name': 'DateTimeFormatDataCheck', - ... 'metadata': { - ... 'columns': None, - ... 'is_target': True, - ... 'rows': None - ... }, - ... 'parameters': { - ... 'time_index': { - ... 'default_value': 'Weeks', - ... 'parameter_type': 'global', - ... 'type': 'str' - ... }, - ... 'frequency_payload': { - ... 'default_value': ww_payload, - ... 'parameter_type': 'global', - ... 'type': 'tuple' - ... } - ... } - ... } - ... ] - ... } - ... ] - The column "Weeks" passed integers instead of datetime data, which will raise an error. >>> X = pd.DataFrame([1, 2, 3, 4], columns=["Weeks"]) @@ -351,7 +316,7 @@ def validate(self, X, y): X = infer_feature_types(X) y = infer_feature_types(y) - + is_multiseries = self.series_id is not None no_dt_found = False if self.datetime_column != "index": @@ -367,7 +332,10 @@ def validate(self, X, y): inferred_freq = pd.infer_freq(datetime_values) except TypeError: no_dt_found = True - + if self.series_id is not None and self.series_id not in X: + raise ValueError( + f"""series_id "{self.series_id}" is not in the dataset.""", + ) if no_dt_found: messages.append( DataCheckError( @@ -378,110 +346,153 @@ def validate(self, X, y): ) return messages - # Check if the data is monotonically increasing - no_nan_datetime_values = datetime_values.dropna() - if not pd.DatetimeIndex(no_nan_datetime_values).is_monotonic_increasing: - messages.append( - DataCheckError( - message="Datetime values must be sorted in ascending order.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_IS_NOT_MONOTONIC, - ).to_dict(), - ) - - col_name = ( - self.datetime_column if self.datetime_column != "index" else "either index" - ) - - ww_payload = infer_frequency( - pd.Series(datetime_values), - debug=True, - window_length=4, - threshold=0.4, + series_datetime = ( + [datetime_values] if self.series_id is None else X[self.series_id].unique() ) - inferred_freq = ww_payload[0] - debug_object = ww_payload[1] - if inferred_freq is not None: - return messages - - # Check for NaN values - if len(debug_object["nan_values"]) > 0: - messages.append( - DataCheckError( - message=f"Input datetime column '{col_name}' contains NaN values. Please impute NaN values or drop these rows.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_NAN, - ).to_dict(), - ) - - # Check for only one row per datetime - if len(debug_object["duplicate_values"]) > 0: - messages.append( - DataCheckError( - message=f"Column '{col_name}' has more than one row with the same datetime value.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, - ).to_dict(), + for series in series_datetime: + + # if multiseries only select the datetimes corresponding to one series + if is_multiseries: + if self.datetime_column != "index": + datetime_values = X[X[self.series_id] == series][ + self.datetime_column + ] + else: + datetime_values = X[X[self.series_id] == series].index + + # Check if the data is monotonically increasing + no_nan_datetime_values = datetime_values.dropna() + if not pd.DatetimeIndex(no_nan_datetime_values).is_monotonic_increasing: + messages.append( + DataCheckError( + message="Datetime values must be sorted in ascending order.", + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_IS_NOT_MONOTONIC, + ).to_dict(), + ) + + col_name = ( + self.datetime_column + if self.datetime_column != "index" + else "either index" ) - # Check for no date missing in ordered dates - if len(debug_object["missing_values"]) > 0: - messages.append( - DataCheckError( - message=f"Column '{col_name}' has datetime values missing between start and end date.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), + ww_payload = infer_frequency( + pd.Series(datetime_values), + debug=True, + window_length=4, + threshold=0.4, ) - - # Check for dates that don't line up with the frequency - if len(debug_object["extra_values"]) > 0: - messages.append( - DataCheckError( - message=f"Column '{col_name}' has datetime values that do not align with the inferred frequency.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, - ).to_dict(), + inferred_freq = ww_payload[0] + debug_object = ww_payload[1] + if inferred_freq is not None: + if is_multiseries: + continue + else: + return messages + + # Check for NaN values + if len(debug_object["nan_values"]) > 0: + series_message = f"Input datetime column '{col_name}' for series '{series}' contains NaN values. Please impute NaN values or drop these rows." + messages.append( + DataCheckError( + message=f"Input datetime column '{col_name}' contains NaN values. Please impute NaN values or drop these rows." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + ).to_dict(), + ) + + # Check for only one row per datetime + if len(debug_object["duplicate_values"]) > 0: + series_message = f"Column '{col_name}' for series '{series}' has more than one row with the same datetime value." + messages.append( + DataCheckError( + message=f"Column '{col_name}' has more than one row with the same datetime value." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, + ).to_dict(), + ) + + # Check for no date missing in ordered dates + if len(debug_object["missing_values"]) > 0: + series_message = f"Column '{col_name}' for series '{series}' has datetime values missing between start and end date." + messages.append( + DataCheckError( + message=f"Column '{col_name}' has datetime values missing between start and end date." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + ) + + # Check for dates that don't line up with the frequency + if len(debug_object["extra_values"]) > 0: + series_message = ( + f"Column '{col_name}' for series '{series}' has datetime values that do not align with the inferred frequency.", + ) + messages.append( + DataCheckError( + message=f"Column '{col_name}' has datetime values that do not align with the inferred frequency." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, + ).to_dict(), + ) + + datetime_values_no_nans_duplicates = ( + no_nan_datetime_values.drop_duplicates() ) - - datetime_values_no_nans_duplicates = no_nan_datetime_values.drop_duplicates() - # Give a generic uneven interval error no frequency can be estimated by woodwork - if debug_object["estimated_freq"] is None or len( - datetime_values_no_nans_duplicates, - ) <= self.nan_duplicate_threshold * len(datetime_values): - messages.append( - DataCheckError( - message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, - ).to_dict(), - ) - else: - messages.append( - DataCheckError( - message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS, - action_options=[ - DataCheckActionOption( - DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET, - data_check_name=self.name, - parameters={ - "time_index": { - "parameter_type": DCAOParameterType.GLOBAL, - "type": "str", - "default_value": col_name, - }, - "frequency_payload": { - "parameter_type": DCAOParameterType.GLOBAL, - "type": "tuple", - "default_value": ww_payload, + # Give a generic uneven interval error no frequency can be estimated by woodwork + if debug_object["estimated_freq"] is None or len( + datetime_values_no_nans_duplicates, + ) <= self.nan_duplicate_threshold * len(datetime_values): + series_message = f"No frequency could be detected in column '{col_name} for series '{series}', possibly due to uneven intervals or too many duplicate/missing values." + messages.append( + DataCheckError( + message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, + ).to_dict(), + ) + else: + series_message = ( + f"A frequency was detected in column '{col_name}' for series '{series}', but there are faulty datetime values that need to be addressed.", + ) + messages.append( + DataCheckError( + message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS, + action_options=[ + DataCheckActionOption( + DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET, + data_check_name=self.name, + parameters={ + "time_index": { + "parameter_type": DCAOParameterType.GLOBAL, + "type": "str", + "default_value": col_name, + }, + "frequency_payload": { + "parameter_type": DCAOParameterType.GLOBAL, + "type": "tuple", + "default_value": ww_payload, + }, }, - }, - metadata={"is_target": True}, - ), - ], - ).to_dict(), - ) + metadata={"is_target": True}, + ), + ], + ).to_dict(), + ) return messages From 586fe315e6820c879833f5603c5eebfb33aa2a0c Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Wed, 6 Sep 2023 19:08:41 -0700 Subject: [PATCH 02/11] added initial test --- .../data_checks/datetime_format_data_check.py | 19 ++- .../test_datetime_format_data_check.py | 145 +++++++++++++++++- 2 files changed, 152 insertions(+), 12 deletions(-) diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index 9e2d890ce4..f30a2e1251 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -319,6 +319,11 @@ def validate(self, X, y): is_multiseries = self.series_id is not None no_dt_found = False + if self.series_id is not None and self.series_id not in X: + raise ValueError( + f"""series_id "{self.series_id}" is not in the dataset.""", + ) + if self.datetime_column != "index": datetime_values = X[self.datetime_column] else: @@ -332,10 +337,7 @@ def validate(self, X, y): inferred_freq = pd.infer_freq(datetime_values) except TypeError: no_dt_found = True - if self.series_id is not None and self.series_id not in X: - raise ValueError( - f"""series_id "{self.series_id}" is not in the dataset.""", - ) + if no_dt_found: messages.append( DataCheckError( @@ -356,7 +358,7 @@ def validate(self, X, y): if self.datetime_column != "index": datetime_values = X[X[self.series_id] == series][ self.datetime_column - ] + ].reset_index(drop=True) else: datetime_values = X[X[self.series_id] == series].index @@ -376,7 +378,6 @@ def validate(self, X, y): if self.datetime_column != "index" else "either index" ) - ww_payload = infer_frequency( pd.Series(datetime_values), debug=True, @@ -452,7 +453,7 @@ def validate(self, X, y): if debug_object["estimated_freq"] is None or len( datetime_values_no_nans_duplicates, ) <= self.nan_duplicate_threshold * len(datetime_values): - series_message = f"No frequency could be detected in column '{col_name} for series '{series}', possibly due to uneven intervals or too many duplicate/missing values." + series_message = f"No frequency could be detected in column '{col_name}' for series '{series}', possibly due to uneven intervals or too many duplicate/missing values." messages.append( DataCheckError( message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values." @@ -463,9 +464,7 @@ def validate(self, X, y): ).to_dict(), ) else: - series_message = ( - f"A frequency was detected in column '{col_name}' for series '{series}', but there are faulty datetime values that need to be addressed.", - ) + series_message = f"A frequency was detected in column '{col_name}' for series '{series}', but there are faulty datetime values that need to be addressed." messages.append( DataCheckError( message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed." diff --git a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py index 4d864a6eb5..56c8c1afbe 100644 --- a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py +++ b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py @@ -18,9 +18,12 @@ THRESHOLD = 0.4 -def get_uneven_error(col_name, ww_payload): +def get_uneven_error(col_name, ww_payload, series=None): + series_message = f"A frequency was detected in column '{col_name}' for series '{series}', but there are faulty datetime values that need to be addressed." error = DataCheckError( - message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed.", + message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed." + if series is None + else series_message, data_check_name=datetime_format_check_name, message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS, action_options=[ @@ -541,3 +544,141 @@ def test_datetime_many_duplicates_and_nans(): result = dc.validate(X, y) assert result[2]["code"] == "DATETIME_NO_FREQUENCY_INFERRED" + + +@pytest.mark.parametrize("input_type", ["pd", "ww"]) +@pytest.mark.parametrize( + "issue", + ["redundant", "missing", "uneven", "type_errors", None], +) +@pytest.mark.parametrize("datetime_loc", [1, "X_index"]) +def test_datetime_format_data_check_typeerror_uneven_intervals_multiseries( + issue, + input_type, + datetime_loc, +): + # there's 60 entries in the dataframe (30 per series) + X, y = pd.DataFrame({"features": range(60)}), pd.Series(range(60)) + X["series_id"] = pd.Series(list(range(2)) * 30, dtype="str") + + if issue == "type_errors": + dates = range(60) + else: + dates = pd.date_range("2021-01-01", periods=60) + + if issue == "missing": + # Skips 2021-01-25 and starts again at 2021-01-27, skipping a date and triggering the error + dates = (pd.date_range("2021-01-01", periods=25).repeat(2)).append( + (pd.date_range("2021-01-27", periods=5).repeat(2)), + ) + + if issue == "uneven": + dates_1 = pd.date_range("2015-01-01", periods=5, freq="D").repeat(2) + dates_2 = pd.date_range("2015-01-08", periods=3, freq="D").repeat(2) + dates_3 = pd.DatetimeIndex(["2015-01-12"]).repeat(2) + dates_4 = pd.date_range("2015-01-15", periods=5, freq="D").repeat(2) + dates_5 = pd.date_range("2015-01-22", periods=5, freq="D").repeat(2) + dates_6 = pd.date_range("2015-01-29", periods=11, freq="M").repeat(2) + + dates = ( + dates_1.append(dates_2) + .append(dates_3) + .append(dates_4) + .append(dates_5) + .append(dates_6) + ) + if issue == "redundant": + # 2021-01-25 repeats twice which triggers an error + dates = (pd.date_range("2021-01-01", periods=25).repeat(2)).append( + (pd.date_range("2021-01-25", periods=5).repeat(2)), + ) + datetime_column = "index" + + if datetime_loc == 1: + X[datetime_loc] = dates + datetime_column = datetime_loc + else: + X.index = dates + + if input_type == "ww": + X.ww.init() + y.ww.init() + + datetime_format_check = DateTimeFormatDataCheck( + datetime_column=datetime_column, + series_id="series_id", + ) + + messages = [] + for series in X["series_id"].unique(): + if issue == "type_errors": + # type error only has 1 message regardless of how many series there are + if len(messages) == 0: + messages.append( + DataCheckError( + message="Datetime information could not be found in the data, or was not in a supported datetime format.", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_INFORMATION_NOT_FOUND, + ).to_dict(), + ) + else: + # separates the datetimes so it only displays the dates that correspond to the current series + if input_type == "ww": + # ww makes the series_id into ints so need to cast + if datetime_loc == "X_index": + dates = pd.Series( + X[X[datetime_format_check.series_id] == int(series)].index, + ) + else: + dates = X[X[datetime_format_check.series_id] == int(series)][ + datetime_column + ] + elif datetime_loc == "X_index": + dates = pd.Series(X[X[datetime_format_check.series_id] == series].index) + else: + dates = X[X[datetime_format_check.series_id] == series][datetime_column] + + ww_payload = infer_frequency( + dates.reset_index(drop=True), + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + + col_name = datetime_loc if datetime_loc == 1 else "either index" + if issue is None: + break + elif issue == "missing": + messages.append( + DataCheckError( + message=f"Column '{col_name}' for series '{series}' has datetime values missing between start and end date.", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + ) + messages.append( + get_uneven_error(col_name, ww_payload, series), + ) + elif issue == "redundant": + messages.append( + DataCheckError( + message=f"Column '{col_name}' for series '{series}' has more than one row with the same datetime value.", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, + ).to_dict(), + ) + messages.append( + get_uneven_error(col_name, ww_payload, series), + ) + else: + messages.append( + DataCheckError( + message=f"No frequency could be detected in column '{col_name}' for series '{series}', possibly due to uneven intervals or too many duplicate/missing values.", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, + ).to_dict(), + ) + if issue is None: + assert datetime_format_check.validate(X, y) == [] + else: + assert datetime_format_check.validate(X, y) == messages From b591717103b9fe03c4d4585bf54fec6f2c045a8b Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 7 Sep 2023 12:24:50 -0700 Subject: [PATCH 03/11] more tests --- .../data_checks/datetime_format_data_check.py | 5 +- .../test_datetime_format_data_check.py | 213 ++++++++++++++---- 2 files changed, 167 insertions(+), 51 deletions(-) diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index f30a2e1251..68fb9ab2d8 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -352,7 +352,6 @@ def validate(self, X, y): [datetime_values] if self.series_id is None else X[self.series_id].unique() ) for series in series_datetime: - # if multiseries only select the datetimes corresponding to one series if is_multiseries: if self.datetime_column != "index": @@ -433,9 +432,7 @@ def validate(self, X, y): # Check for dates that don't line up with the frequency if len(debug_object["extra_values"]) > 0: - series_message = ( - f"Column '{col_name}' for series '{series}' has datetime values that do not align with the inferred frequency.", - ) + series_message = f"Column '{col_name}' for series '{series}' has datetime values that do not align with the inferred frequency." messages.append( DataCheckError( message=f"Column '{col_name}' has datetime values that do not align with the inferred frequency." diff --git a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py index 56c8c1afbe..9161a4ca17 100644 --- a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py +++ b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py @@ -352,35 +352,6 @@ def test_datetime_format_data_check_multiple_errors(): get_uneven_error("dates", ww_payload), ] - dates = ( - pd.date_range("2021-01-01", periods=15, freq="2D") - .drop("2021-01-13") - .append(pd.date_range("2021-01-30", periods=1)) - .append(pd.date_range("2021-01-31", periods=86, freq="2D")) - ) - X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") - - ww_payload = infer_frequency( - X["dates"], - debug=True, - window_length=WINDOW_LENGTH, - threshold=THRESHOLD, - ) - - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message="Column 'dates' has datetime values missing between start and end date.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - DataCheckError( - message="Column 'dates' has datetime values that do not align with the inferred frequency.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, - ).to_dict(), - get_uneven_error("dates", ww_payload), - ] - def test_datetime_format_unusual_interval(): dates = pd.date_range(start="2021-01-01", periods=100, freq="4D") @@ -649,26 +620,26 @@ def test_datetime_format_data_check_typeerror_uneven_intervals_multiseries( if issue is None: break elif issue == "missing": - messages.append( - DataCheckError( - message=f"Column '{col_name}' for series '{series}' has datetime values missing between start and end date.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - ) - messages.append( - get_uneven_error(col_name, ww_payload, series), + messages.extend( + [ + DataCheckError( + message=f"Column '{col_name}' for series '{series}' has datetime values missing between start and end date.", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + get_uneven_error(col_name, ww_payload, series), + ], ) elif issue == "redundant": - messages.append( - DataCheckError( - message=f"Column '{col_name}' for series '{series}' has more than one row with the same datetime value.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, - ).to_dict(), - ) - messages.append( - get_uneven_error(col_name, ww_payload, series), + messages.extend( + [ + DataCheckError( + message=f"Column '{col_name}' for series '{series}' has more than one row with the same datetime value.", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, + ).to_dict(), + get_uneven_error(col_name, ww_payload, series), + ], ) else: messages.append( @@ -682,3 +653,151 @@ def test_datetime_format_data_check_typeerror_uneven_intervals_multiseries( assert datetime_format_check.validate(X, y) == [] else: assert datetime_format_check.validate(X, y) == messages + + +def test_datetime_format_data_check_invalid_seriesid_multiseries( + multiseries_ts_data_stacked, +): + X, y = multiseries_ts_data_stacked + datetime_format_check = DateTimeFormatDataCheck( + datetime_column="Date", + series_id="not_series_id", + ) + with pytest.raises( + ValueError, + match="""series_id "not_series_id" is not in the dataset.""", + ): + datetime_format_check.validate(X, y) + + +@pytest.mark.parametrize("n_missing", [2, 5, 7]) +def test_datetime_format_data_check_multiple_missing_multiseries(n_missing): + X, y = pd.DataFrame({"features": range(100)}), pd.Series(range(100)) + X["series_id"] = pd.Series(list(range(2)) * 50, dtype="str") + + dates = pd.date_range("2021-01-01", periods=15).repeat(2) + if n_missing == 2: + # Two missing dates in separate spots + dates = dates.append(pd.date_range("2021-01-17", periods=36).repeat(2)).drop( + "2021-01-22", + ) + # print(dates) + # assert 1 == 0 + elif n_missing == 5: + # A chunk of 5 missing days in a row + dates = dates.append(pd.date_range("2021-01-21", periods=35).repeat(2)) + else: + # Some chunks missing and some alone missing + dates = dates.append(pd.date_range("2021-01-19", periods=39).repeat(2)).drop( + "2021-01-27", + ) + dates = dates.drop("2021-02-22") + dates = dates.drop("2021-01-11") + dates = dates.drop("2021-01-20") + + X["dates"] = dates + datetime_format_check = DateTimeFormatDataCheck( + datetime_column="dates", + series_id="series_id", + ) + + messages = [] + for series in X["series_id"].unique(): + ww_payload = infer_frequency( + X[X["series_id"] == series]["dates"].reset_index(drop=True), + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + messages.extend( + [ + DataCheckError( + message=f"""Column 'dates' for series '{series}' has datetime values missing between start and end date.""", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + get_uneven_error("dates", ww_payload, series), + ], + ) + assert len(messages) == 4 + assert datetime_format_check.validate(X, y) == messages + + +@pytest.mark.parametrize("nans", [0, 1, 2]) +def test_datetime_format_data_check_nan_multiseries(nans): + dates = pd.Series(pd.date_range(start="2021-01-01", periods=20).repeat(2)) + if nans == 1: + dates[0] = np.NaN + elif nans == 2: + dates[0] = np.NaN + dates[1] = np.NaN + X = pd.DataFrame(dates, columns=["date"]) + X["series_id"] = pd.Series(list(range(2)) * 20, dtype="str") + + messages = [] + for series in X["series_id"].unique(): + ww_payload = infer_frequency( + X[X["series_id"] == series]["date"].reset_index(drop=True), + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + if (series == "0" and nans >= 1) or (series == "1" and nans >= 2): + messages.extend( + [ + DataCheckError( + message=f"""Input datetime column 'date' for series '{series}' contains NaN values. Please impute NaN values or drop these rows.""", + data_check_name=DateTimeFormatDataCheck.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + ).to_dict(), + get_uneven_error("date", ww_payload, series), + ], + ) + + dt_nan_check = DateTimeFormatDataCheck( + datetime_column="date", + series_id="series_id", + ) + assert dt_nan_check.validate(X, pd.Series(dtype="float64")) == messages + + +def test_datetime_format_data_check_multiseries_not_aligned_frequency(): + dates = ( + pd.date_range("2021-01-01", periods=15, freq="2D") + .repeat(2) + .drop("2021-01-13") + .append(pd.date_range("2021-01-30", periods=1).repeat(2)) + .append(pd.date_range("2021-01-31", periods=35, freq="2D").repeat(2)) + ) + X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") + X["series_id"] = pd.Series(list(range(2)) * 50, dtype="str") + datetime_format_check = DateTimeFormatDataCheck( + datetime_column="dates", + series_id="series_id", + ) + + messages = [] + for series in X["series_id"].unique(): + ww_payload = infer_frequency( + X[X["series_id"] == series]["dates"].reset_index(drop=True), + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + + messages.extend( + [ + DataCheckError( + message=f"""Column 'dates' for series '{series}' has datetime values missing between start and end date.""", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + DataCheckError( + message=f"""Column 'dates' for series '{series}' has datetime values that do not align with the inferred frequency.""", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, + ).to_dict(), + get_uneven_error("dates", ww_payload, series), + ], + ) + assert datetime_format_check.validate(X, pd.Series(dtype="float64")) == messages From fe129ff75f5cbc8e89ec7589298afb27b799bdaa Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 7 Sep 2023 12:28:27 -0700 Subject: [PATCH 04/11] release notes --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 06a28a2e69..d7ac409291 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -5,6 +5,7 @@ Release Notes * Extended STLDecomposer to Support Multiseries :pr:`4253` * Extended TimeSeriesImputer to handle multiseries :pr:`4291` * Added datacheck to check for mismatched series length in multiseries :pr:`4296` + * Extended DateTimeFormatCheck data check to handle multiseries :pr:`4300` * Fixes * Changes * Documentation Changes From 9186484c6cf0df5ce734fb792fb964a8c7d02813 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 7 Sep 2023 12:30:07 -0700 Subject: [PATCH 05/11] release notes again --- docs/source/release_notes.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index d7ac409291..c990906b76 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,10 +2,10 @@ Release Notes ------------- **Future Releases** * Enhancements - * Extended STLDecomposer to Support Multiseries :pr:`4253` - * Extended TimeSeriesImputer to handle multiseries :pr:`4291` + * Extended STLDecomposer to support Multiseries :pr:`4253` + * Extended TimeSeriesImputer to support multiseries :pr:`4291` * Added datacheck to check for mismatched series length in multiseries :pr:`4296` - * Extended DateTimeFormatCheck data check to handle multiseries :pr:`4300` + * Extended DateTimeFormatCheck data check to support multiseries :pr:`4300` * Fixes * Changes * Documentation Changes From df676a25965b56ba8d2265d52e93eef6e4f9097a Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 7 Sep 2023 12:30:35 -0700 Subject: [PATCH 06/11] capitalization --- docs/source/release_notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index c990906b76..e40729c7e1 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,7 +2,7 @@ Release Notes ------------- **Future Releases** * Enhancements - * Extended STLDecomposer to support Multiseries :pr:`4253` + * Extended STLDecomposer to support multiseries :pr:`4253` * Extended TimeSeriesImputer to support multiseries :pr:`4291` * Added datacheck to check for mismatched series length in multiseries :pr:`4296` * Extended DateTimeFormatCheck data check to support multiseries :pr:`4300` From d3ee567b960447924d0e7eac991e4d8272ed8df9 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 7 Sep 2023 14:45:29 -0700 Subject: [PATCH 07/11] removed random comment --- .../test_datetime_format_data_check.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py index 9161a4ca17..156fba343e 100644 --- a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py +++ b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py @@ -528,7 +528,7 @@ def test_datetime_format_data_check_typeerror_uneven_intervals_multiseries( input_type, datetime_loc, ): - # there's 60 entries in the dataframe (30 per series) + # there's 60 entries in the dataframe (30 entries per series) X, y = pd.DataFrame({"features": range(60)}), pd.Series(range(60)) X["series_id"] = pd.Series(list(range(2)) * 30, dtype="str") @@ -609,7 +609,7 @@ def test_datetime_format_data_check_typeerror_uneven_intervals_multiseries( else: dates = X[X[datetime_format_check.series_id] == series][datetime_column] - ww_payload = infer_frequency( + ww_payload_expected = infer_frequency( dates.reset_index(drop=True), debug=True, window_length=WINDOW_LENGTH, @@ -627,7 +627,7 @@ def test_datetime_format_data_check_typeerror_uneven_intervals_multiseries( data_check_name=datetime_format_check_name, message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, ).to_dict(), - get_uneven_error(col_name, ww_payload, series), + get_uneven_error(col_name, ww_payload_expected, series), ], ) elif issue == "redundant": @@ -638,7 +638,7 @@ def test_datetime_format_data_check_typeerror_uneven_intervals_multiseries( data_check_name=datetime_format_check_name, message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, ).to_dict(), - get_uneven_error(col_name, ww_payload, series), + get_uneven_error(col_name, ww_payload_expected, series), ], ) else: @@ -681,8 +681,6 @@ def test_datetime_format_data_check_multiple_missing_multiseries(n_missing): dates = dates.append(pd.date_range("2021-01-17", periods=36).repeat(2)).drop( "2021-01-22", ) - # print(dates) - # assert 1 == 0 elif n_missing == 5: # A chunk of 5 missing days in a row dates = dates.append(pd.date_range("2021-01-21", periods=35).repeat(2)) @@ -703,7 +701,7 @@ def test_datetime_format_data_check_multiple_missing_multiseries(n_missing): messages = [] for series in X["series_id"].unique(): - ww_payload = infer_frequency( + ww_payload_expected = infer_frequency( X[X["series_id"] == series]["dates"].reset_index(drop=True), debug=True, window_length=WINDOW_LENGTH, @@ -716,7 +714,7 @@ def test_datetime_format_data_check_multiple_missing_multiseries(n_missing): data_check_name=datetime_format_check_name, message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, ).to_dict(), - get_uneven_error("dates", ww_payload, series), + get_uneven_error("dates", ww_payload_expected, series), ], ) assert len(messages) == 4 @@ -736,7 +734,7 @@ def test_datetime_format_data_check_nan_multiseries(nans): messages = [] for series in X["series_id"].unique(): - ww_payload = infer_frequency( + ww_payload_expected = infer_frequency( X[X["series_id"] == series]["date"].reset_index(drop=True), debug=True, window_length=WINDOW_LENGTH, @@ -750,7 +748,7 @@ def test_datetime_format_data_check_nan_multiseries(nans): data_check_name=DateTimeFormatDataCheck.name, message_code=DataCheckMessageCode.DATETIME_HAS_NAN, ).to_dict(), - get_uneven_error("date", ww_payload, series), + get_uneven_error("date", ww_payload_expected, series), ], ) @@ -778,7 +776,7 @@ def test_datetime_format_data_check_multiseries_not_aligned_frequency(): messages = [] for series in X["series_id"].unique(): - ww_payload = infer_frequency( + ww_payload_expected = infer_frequency( X[X["series_id"] == series]["dates"].reset_index(drop=True), debug=True, window_length=WINDOW_LENGTH, @@ -797,7 +795,7 @@ def test_datetime_format_data_check_multiseries_not_aligned_frequency(): data_check_name=datetime_format_check_name, message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, ).to_dict(), - get_uneven_error("dates", ww_payload, series), + get_uneven_error("dates", ww_payload_expected, series), ], ) assert datetime_format_check.validate(X, pd.Series(dtype="float64")) == messages From b4061cf74918cc282b354e364d52cc683b52ca50 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 7 Sep 2023 16:21:38 -0700 Subject: [PATCH 08/11] docstring test --- .../data_checks/datetime_format_data_check.py | 56 ++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index 68fb9ab2d8..4d32b5208b 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -310,7 +310,61 @@ def validate(self, X, y): ... ] ... } ... ] - ... + + For multiseries, the datacheck will go through each series and perform checks on them similar to the single series case + To denote that the datacheck is checking a multiseries, pass in the name of the series_id column to the datacheck + + >>> X = pd.DataFrame( + ... { + ... "date": pd.date_range("2021-01-01", periods=15).repeat(2), + ... "series_id": pd.Series(list(range(2)) * 15, dtype="str") + ... } + ... ) + >>> X = X.drop([15]) + >>> dc = DateTimeFormatDataCheck(datetime_column="date", series_id="series_id") + >>> ww_payload_expected_series1 = infer_frequency((X[X["series_id"] == "1"]["date"].reset_index(drop=True)), debug=True, window_length=4, threshold=0.4) + >>> xd = dc.validate(X,y) + >>> assert dc.validate(X, y) == [ + ... { + ... "message": "Column 'date' for series '1' has datetime values missing between start and end date.", + ... "data_check_name": "DateTimeFormatDataCheck", + ... "level": "error", + ... "details": {"columns": None, "rows": None}, + ... "code": "DATETIME_IS_MISSING_VALUES", + ... "action_options": [] + ... }, + ... { + ... "message": "A frequency was detected in column 'date' for series '1', but there are faulty datetime values that need to be addressed.", + ... "data_check_name": "DateTimeFormatDataCheck", + ... "level": "error", + ... "code": "DATETIME_HAS_UNEVEN_INTERVALS", + ... "details": {'columns': None, 'rows': None}, + ... "action_options": [ + ... { + ... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', + ... 'data_check_name': 'DateTimeFormatDataCheck', + ... 'metadata': { + ... 'columns': None, + ... 'is_target': True, + ... 'rows': None + ... }, + ... 'parameters': { + ... 'time_index': { + ... 'default_value': 'date', + ... 'parameter_type': 'global', + ... 'type': 'str' + ... }, + ... 'frequency_payload': { + ... 'default_value': ww_payload_expected_series1, + ... 'parameter_type': 'global', + ... 'type': 'tuple' + ... } + ... } + ... } + ... ] + ... } + ... ] + """ messages = [] From 9f266cfb1325cacc3e35d7c069ab39937316dca6 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Mon, 11 Sep 2023 13:50:16 -0700 Subject: [PATCH 09/11] fixed tests and small changes --- .../data_checks/datetime_format_data_check.py | 22 +- .../test_datetime_format_data_check.py | 491 ++++++++---------- 2 files changed, 228 insertions(+), 285 deletions(-) diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index 4d32b5208b..76d3785a22 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -402,18 +402,17 @@ def validate(self, X, y): ) return messages - series_datetime = ( - [datetime_values] if self.series_id is None else X[self.series_id].unique() - ) + series_datetime = [0] if self.series_id is None else X[self.series_id].unique() for series in series_datetime: # if multiseries only select the datetimes corresponding to one series if is_multiseries: + curr_series_df = X[X[self.series_id] == series] if self.datetime_column != "index": - datetime_values = X[X[self.series_id] == series][ - self.datetime_column - ].reset_index(drop=True) + datetime_values = curr_series_df[self.datetime_column].reset_index( + drop=True, + ) else: - datetime_values = X[X[self.series_id] == series].index + datetime_values = curr_series_df.index # Check if the data is monotonically increasing no_nan_datetime_values = datetime_values.dropna() @@ -439,11 +438,10 @@ def validate(self, X, y): ) inferred_freq = ww_payload[0] debug_object = ww_payload[1] - if inferred_freq is not None: - if is_multiseries: - continue - else: - return messages + if inferred_freq is not None and is_multiseries: + continue + elif inferred_freq is not None: + return messages # Check for NaN values if len(debug_object["nan_values"]) > 0: diff --git a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py index 156fba343e..f75b9a6406 100644 --- a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py +++ b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py @@ -54,31 +54,49 @@ def get_uneven_error(col_name, ww_payload, series=None): "issue", ["redundant", "missing", "uneven", "type_errors", None], ) -@pytest.mark.parametrize("datetime_loc", [1, "X_index", "y_index"]) +@pytest.mark.parametrize( + "datetime_loc, is_multiseries, repeat", + [ + (1, True, 2), + (1, False, 1), + ("X_index", True, 2), + ("X_index", False, 1), + ("y_index", False, 1), + ], +) def test_datetime_format_data_check_typeerror_uneven_intervals( issue, input_type, datetime_loc, + is_multiseries, + repeat, ): - X, y = pd.DataFrame({"features": range(30)}), pd.Series(range(30)) + if is_multiseries: + time_length = 60 + else: + time_length = 30 + + X, y = pd.DataFrame({"features": range(time_length)}), pd.Series(range(time_length)) + if is_multiseries: + X["series_id"] = pd.Series(list(range(2)) * 30, dtype="str") if issue == "type_errors": - dates = range(30) + dates = range(time_length) else: - dates = pd.date_range("2021-01-01", periods=30) + dates = pd.date_range("2021-01-01", periods=time_length) if issue == "missing": # Skips 2021-01-25 and starts again at 2021-01-27, skipping a date and triggering the error - dates = pd.date_range("2021-01-01", periods=25).append( - pd.date_range("2021-01-27", periods=5), + dates = (pd.date_range("2021-01-01", periods=25).repeat(repeat)).append( + (pd.date_range("2021-01-27", periods=5).repeat(repeat)), ) if issue == "uneven": - dates_1 = pd.date_range("2015-01-01", periods=5, freq="D") - dates_2 = pd.date_range("2015-01-08", periods=3, freq="D") - dates_3 = pd.DatetimeIndex(["2015-01-12"]) - dates_4 = pd.date_range("2015-01-15", periods=5, freq="D") - dates_5 = pd.date_range("2015-01-22", periods=5, freq="D") - dates_6 = pd.date_range("2015-01-29", periods=11, freq="M") + dates_1 = pd.date_range("2015-01-01", periods=5, freq="D").repeat(repeat) + dates_2 = pd.date_range("2015-01-08", periods=3, freq="D").repeat(repeat) + dates_3 = pd.DatetimeIndex(["2015-01-12"]).repeat(repeat) + dates_4 = pd.date_range("2015-01-15", periods=5, freq="D").repeat(repeat) + dates_5 = pd.date_range("2015-01-22", periods=5, freq="D").repeat(repeat) + dates_6 = pd.date_range("2015-01-29", periods=11, freq="M").repeat(repeat) dates = ( dates_1.append(dates_2) @@ -88,11 +106,12 @@ def test_datetime_format_data_check_typeerror_uneven_intervals( .append(dates_6) ) if issue == "redundant": - dates = pd.date_range("2021-01-01", periods=29).append( - pd.date_range("2021-01-29", periods=1), + dates = (pd.date_range("2021-01-01", periods=29).repeat(repeat)).append( + (pd.date_range("2021-01-29", periods=1).repeat(repeat)), ) datetime_column = "index" + if datetime_loc == 1: X[datetime_loc] = dates datetime_column = datetime_loc @@ -105,59 +124,125 @@ def test_datetime_format_data_check_typeerror_uneven_intervals( X.ww.init() y.ww.init() - datetime_format_check = DateTimeFormatDataCheck(datetime_column=datetime_column) - - if issue == "type_errors": - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message="Datetime information could not be found in the data, or was not in a supported datetime format.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_INFORMATION_NOT_FOUND, - ).to_dict(), - ] - else: - if datetime_loc == "X_index": - dates = pd.Series(X.index) - elif datetime_loc == "y_index": - dates = pd.Series(y.index) - else: - dates = X[datetime_column] - ww_payload = infer_frequency( - dates, - debug=True, - window_length=WINDOW_LENGTH, - threshold=THRESHOLD, + if is_multiseries: + datetime_format_check = DateTimeFormatDataCheck( + datetime_column=datetime_column, + series_id="series_id", ) + else: + datetime_format_check = DateTimeFormatDataCheck(datetime_column=datetime_column) - col_name = datetime_loc if datetime_loc == 1 else "either index" - if issue is None: - assert datetime_format_check.validate(X, y) == [] - elif issue == "missing": - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message=f"Column '{col_name}' has datetime values missing between start and end date.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - get_uneven_error(col_name, ww_payload), - ] - elif issue == "redundant": - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message=f"Column '{col_name}' has more than one row with the same datetime value.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, - ).to_dict(), - get_uneven_error(col_name, ww_payload), - ] + all_series = X["series_id"].unique() if is_multiseries else [0] + messages = [] + + for series in all_series: + if issue == "type_errors": + if len(messages) == 0: + # type error only gives 1 message regardless of how many series there are + messages.append( + DataCheckError( + message="Datetime information could not be found in the data, or was not in a supported datetime format.", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_INFORMATION_NOT_FOUND, + ).to_dict(), + ) else: - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, - ).to_dict(), - ] + if is_multiseries: + curr_series_df = X[X[datetime_format_check.series_id] == series] + + # separates the datetimes so it only displays the dates that correspond to the current series + if input_type == "ww" and is_multiseries: + # ww makes the series_id into ints so need to cast series into ints + if datetime_loc == "X_index": + dates = pd.Series( + X[X[datetime_format_check.series_id] == int(series)].index, + ) + else: + dates = X[X[datetime_format_check.series_id] == int(series)][ + datetime_column + ] + elif datetime_loc == "X_index": + if is_multiseries: + dates = pd.Series(curr_series_df.index) + else: + dates = pd.Series(X.index) + elif datetime_loc == "y_index": + dates = pd.Series(y.index) + else: + if is_multiseries: + dates = pd.Series(curr_series_df[datetime_column]) + else: + dates = X[datetime_column] + ww_payload_expected = infer_frequency( + # this part might cause issues + dates.reset_index(drop=True), + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + + col_name = datetime_loc if datetime_loc == 1 else "either index" + if issue is None: + break + elif issue == "missing": + if is_multiseries: + message = f"Column '{col_name}' for series '{series}' has datetime values missing between start and end date." + uneven_error = get_uneven_error( + col_name, + ww_payload_expected, + series, + ) + else: + message = f"Column '{col_name}' has datetime values missing between start and end date." + uneven_error = get_uneven_error(col_name, ww_payload_expected) + messages.extend( + [ + DataCheckError( + message=message, + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + uneven_error, + ], + ) + elif issue == "redundant": + if is_multiseries: + message = f"Column '{col_name}' for series '{series}' has more than one row with the same datetime value." + uneven_error = get_uneven_error( + col_name, + ww_payload_expected, + series, + ) + else: + message = f"Column '{col_name}' has more than one row with the same datetime value." + uneven_error = get_uneven_error(col_name, ww_payload_expected) + messages.extend( + [ + DataCheckError( + message=message, + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, + ).to_dict(), + uneven_error, + ], + ) + else: + if is_multiseries: + message = f"No frequency could be detected in column '{col_name}' for series '{series}', possibly due to uneven intervals or too many duplicate/missing values." + else: + message = f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values." + + messages.append( + DataCheckError( + message=message, + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, + ).to_dict(), + ) + if issue is None: + assert datetime_format_check.validate(X, y) == [] + else: + assert datetime_format_check.validate(X, y) == messages @pytest.mark.parametrize("sort_order", ["increasing", "decreasing", "mixed"]) @@ -205,40 +290,89 @@ def test_datetime_format_data_check_monotonic(datetime_loc, sort_order): @pytest.mark.parametrize("n_missing", [2, 5, 7]) -def test_datetime_format_data_check_multiple_missing(n_missing): +@pytest.mark.parametrize("is_multiseries, repeat", [(True, 2), (False, 1)]) +def test_datetime_format_data_check_multiple_missing(n_missing, is_multiseries, repeat): X, y = pd.DataFrame({"features": range(100)}), pd.Series(range(100)) + if is_multiseries: + X["series_id"] = pd.Series(list(range(2)) * 50, dtype="str") - dates = pd.date_range("2021-01-01", periods=15) + dates = pd.date_range("2021-01-01", periods=15).repeat(repeat) if n_missing == 2: # Two missing dates in separate spots - dates = dates.append(pd.date_range("2021-01-17", periods=86)).drop("2021-01-22") + if is_multiseries: + dates = dates.append( + pd.date_range("2021-01-17", periods=36).repeat(2), + ).drop( + "2021-01-22", + ) + else: + dates = dates.append(pd.date_range("2021-01-17", periods=86)).drop( + "2021-01-22", + ) elif n_missing == 5: # A chunk of 5 missing days in a row - dates = dates.append(pd.date_range("2021-01-21", periods=85)) + if is_multiseries: + dates = dates.append(pd.date_range("2021-01-21", periods=35).repeat(2)) + else: + dates = dates.append(pd.date_range("2021-01-21", periods=85)) else: # Some chunks missing and some alone missing - dates = dates.append(pd.date_range("2021-01-20", periods=88)).drop("2021-01-27") + if is_multiseries: + dates = dates.append( + pd.date_range("2021-01-19", periods=39).repeat(2), + ).drop( + "2021-01-27", + ) + dates = dates.drop("2021-01-20") + else: + dates = dates.append(pd.date_range("2021-01-20", periods=88)).drop( + "2021-01-27", + ) dates = dates.drop("2021-02-22") dates = dates.drop("2021-01-11") X["dates"] = dates - datetime_format_check = DateTimeFormatDataCheck(datetime_column="dates") - ww_payload = infer_frequency( - X["dates"], - debug=True, - window_length=WINDOW_LENGTH, - threshold=THRESHOLD, - ) + if is_multiseries: + datetime_format_check = DateTimeFormatDataCheck( + datetime_column="dates", + series_id="series_id", + ) + else: + datetime_format_check = DateTimeFormatDataCheck(datetime_column="dates") - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message="Column 'dates' has datetime values missing between start and end date.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - get_uneven_error("dates", ww_payload), - ] + messages = [] + series_list = X["series_id"].unique() if is_multiseries else [0] + + for series in series_list: + observed_ts = ( + X[X["series_id"] == series]["dates"].reset_index(drop=True) + if is_multiseries + else X["dates"] + ) + ww_payload_expected = infer_frequency( + observed_ts, + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + if is_multiseries: + message = f"""Column 'dates' for series '{series}' has datetime values missing between start and end date.""" + uneven_error = get_uneven_error("dates", ww_payload_expected, series) + else: + message = """Column 'dates' has datetime values missing between start and end date.""" + uneven_error = get_uneven_error("dates", ww_payload_expected) + messages.extend( + [ + DataCheckError( + message=message, + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + uneven_error, + ], + ) + assert datetime_format_check.validate(X, y) == messages def test_datetime_format_data_check_multiple_errors(): @@ -517,144 +651,6 @@ def test_datetime_many_duplicates_and_nans(): assert result[2]["code"] == "DATETIME_NO_FREQUENCY_INFERRED" -@pytest.mark.parametrize("input_type", ["pd", "ww"]) -@pytest.mark.parametrize( - "issue", - ["redundant", "missing", "uneven", "type_errors", None], -) -@pytest.mark.parametrize("datetime_loc", [1, "X_index"]) -def test_datetime_format_data_check_typeerror_uneven_intervals_multiseries( - issue, - input_type, - datetime_loc, -): - # there's 60 entries in the dataframe (30 entries per series) - X, y = pd.DataFrame({"features": range(60)}), pd.Series(range(60)) - X["series_id"] = pd.Series(list(range(2)) * 30, dtype="str") - - if issue == "type_errors": - dates = range(60) - else: - dates = pd.date_range("2021-01-01", periods=60) - - if issue == "missing": - # Skips 2021-01-25 and starts again at 2021-01-27, skipping a date and triggering the error - dates = (pd.date_range("2021-01-01", periods=25).repeat(2)).append( - (pd.date_range("2021-01-27", periods=5).repeat(2)), - ) - - if issue == "uneven": - dates_1 = pd.date_range("2015-01-01", periods=5, freq="D").repeat(2) - dates_2 = pd.date_range("2015-01-08", periods=3, freq="D").repeat(2) - dates_3 = pd.DatetimeIndex(["2015-01-12"]).repeat(2) - dates_4 = pd.date_range("2015-01-15", periods=5, freq="D").repeat(2) - dates_5 = pd.date_range("2015-01-22", periods=5, freq="D").repeat(2) - dates_6 = pd.date_range("2015-01-29", periods=11, freq="M").repeat(2) - - dates = ( - dates_1.append(dates_2) - .append(dates_3) - .append(dates_4) - .append(dates_5) - .append(dates_6) - ) - if issue == "redundant": - # 2021-01-25 repeats twice which triggers an error - dates = (pd.date_range("2021-01-01", periods=25).repeat(2)).append( - (pd.date_range("2021-01-25", periods=5).repeat(2)), - ) - datetime_column = "index" - - if datetime_loc == 1: - X[datetime_loc] = dates - datetime_column = datetime_loc - else: - X.index = dates - - if input_type == "ww": - X.ww.init() - y.ww.init() - - datetime_format_check = DateTimeFormatDataCheck( - datetime_column=datetime_column, - series_id="series_id", - ) - - messages = [] - for series in X["series_id"].unique(): - if issue == "type_errors": - # type error only has 1 message regardless of how many series there are - if len(messages) == 0: - messages.append( - DataCheckError( - message="Datetime information could not be found in the data, or was not in a supported datetime format.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_INFORMATION_NOT_FOUND, - ).to_dict(), - ) - else: - # separates the datetimes so it only displays the dates that correspond to the current series - if input_type == "ww": - # ww makes the series_id into ints so need to cast - if datetime_loc == "X_index": - dates = pd.Series( - X[X[datetime_format_check.series_id] == int(series)].index, - ) - else: - dates = X[X[datetime_format_check.series_id] == int(series)][ - datetime_column - ] - elif datetime_loc == "X_index": - dates = pd.Series(X[X[datetime_format_check.series_id] == series].index) - else: - dates = X[X[datetime_format_check.series_id] == series][datetime_column] - - ww_payload_expected = infer_frequency( - dates.reset_index(drop=True), - debug=True, - window_length=WINDOW_LENGTH, - threshold=THRESHOLD, - ) - - col_name = datetime_loc if datetime_loc == 1 else "either index" - if issue is None: - break - elif issue == "missing": - messages.extend( - [ - DataCheckError( - message=f"Column '{col_name}' for series '{series}' has datetime values missing between start and end date.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - get_uneven_error(col_name, ww_payload_expected, series), - ], - ) - elif issue == "redundant": - messages.extend( - [ - DataCheckError( - message=f"Column '{col_name}' for series '{series}' has more than one row with the same datetime value.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, - ).to_dict(), - get_uneven_error(col_name, ww_payload_expected, series), - ], - ) - else: - messages.append( - DataCheckError( - message=f"No frequency could be detected in column '{col_name}' for series '{series}', possibly due to uneven intervals or too many duplicate/missing values.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, - ).to_dict(), - ) - if issue is None: - assert datetime_format_check.validate(X, y) == [] - else: - assert datetime_format_check.validate(X, y) == messages - - def test_datetime_format_data_check_invalid_seriesid_multiseries( multiseries_ts_data_stacked, ): @@ -670,57 +666,6 @@ def test_datetime_format_data_check_invalid_seriesid_multiseries( datetime_format_check.validate(X, y) -@pytest.mark.parametrize("n_missing", [2, 5, 7]) -def test_datetime_format_data_check_multiple_missing_multiseries(n_missing): - X, y = pd.DataFrame({"features": range(100)}), pd.Series(range(100)) - X["series_id"] = pd.Series(list(range(2)) * 50, dtype="str") - - dates = pd.date_range("2021-01-01", periods=15).repeat(2) - if n_missing == 2: - # Two missing dates in separate spots - dates = dates.append(pd.date_range("2021-01-17", periods=36).repeat(2)).drop( - "2021-01-22", - ) - elif n_missing == 5: - # A chunk of 5 missing days in a row - dates = dates.append(pd.date_range("2021-01-21", periods=35).repeat(2)) - else: - # Some chunks missing and some alone missing - dates = dates.append(pd.date_range("2021-01-19", periods=39).repeat(2)).drop( - "2021-01-27", - ) - dates = dates.drop("2021-02-22") - dates = dates.drop("2021-01-11") - dates = dates.drop("2021-01-20") - - X["dates"] = dates - datetime_format_check = DateTimeFormatDataCheck( - datetime_column="dates", - series_id="series_id", - ) - - messages = [] - for series in X["series_id"].unique(): - ww_payload_expected = infer_frequency( - X[X["series_id"] == series]["dates"].reset_index(drop=True), - debug=True, - window_length=WINDOW_LENGTH, - threshold=THRESHOLD, - ) - messages.extend( - [ - DataCheckError( - message=f"""Column 'dates' for series '{series}' has datetime values missing between start and end date.""", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - get_uneven_error("dates", ww_payload_expected, series), - ], - ) - assert len(messages) == 4 - assert datetime_format_check.validate(X, y) == messages - - @pytest.mark.parametrize("nans", [0, 1, 2]) def test_datetime_format_data_check_nan_multiseries(nans): dates = pd.Series(pd.date_range(start="2021-01-01", periods=20).repeat(2)) From 982888702aa6b56c01372760d3a55aae0c417a1e Mon Sep 17 00:00:00 2001 From: Michael Fu Date: Tue, 12 Sep 2023 11:24:23 -0700 Subject: [PATCH 10/11] Update evalml/data_checks/datetime_format_data_check.py Co-authored-by: Jeremy Shih --- evalml/data_checks/datetime_format_data_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index 76d3785a22..ac9495afdf 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -41,7 +41,7 @@ def validate(self, X, y): Will return DataCheckError(s) if the data is not a datetime type, is not increasing, has redundant or missing row(s), contains invalid (NaN or None) values, or has values that don't align with the assumed frequency. - If used for multiseries problem, works specifically on stacked datasets + If used for multiseries problem, works specifically on stacked datasets. Args: X (pd.DataFrame, np.ndarray): Features. From 8c118c0e492e3ad694420721705b51d6c3029a52 Mon Sep 17 00:00:00 2001 From: Michael Fu Date: Tue, 12 Sep 2023 11:24:38 -0700 Subject: [PATCH 11/11] Update evalml/data_checks/datetime_format_data_check.py Co-authored-by: Jeremy Shih --- evalml/data_checks/datetime_format_data_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index ac9495afdf..9aaaac3244 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -16,7 +16,7 @@ class DateTimeFormatDataCheck(DataCheck): """Check if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators. - If used for multiseries problem, works specifically on stacked datasets + If used for multiseries problem, works specifically on stacked datasets. Args: datetime_column (str, int): The name of the datetime column. If the datetime values are in the index, then pass "index".