diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 3f5b6707ec..bfe451047c 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,10 +2,11 @@ Release Notes ------------- **Future Releases** * Enhancements - * Extended STLDecomposer to Support Multiseries :pr:`4253` - * Extended TimeSeriesImputer to handle multiseries :pr:`4291` + * Extended STLDecomposer to support multiseries :pr:`4253` + * Extended TimeSeriesImputer to support multiseries :pr:`4291` * Added datacheck to check for mismatched series length in multiseries :pr:`4296` * Added STLDecomposer to multiseries pipelines :pr:`4299` + * Extended DateTimeFormatDataCheck to support multiseries :pr:`4300` * Fixes * Changes * Documentation Changes diff --git a/evalml/data_checks/datetime_format_data_check.py b/evalml/data_checks/datetime_format_data_check.py index a3c9af8082..9aaaac3244 100644 --- a/evalml/data_checks/datetime_format_data_check.py +++ b/evalml/data_checks/datetime_format_data_check.py @@ -16,22 +16,33 @@ class DateTimeFormatDataCheck(DataCheck): """Check if the datetime column has equally spaced intervals and is monotonically increasing or decreasing in order to be supported by time series estimators. + If used for a multiseries problem, works specifically on stacked datasets. + Args: datetime_column (str, int): The name of the datetime column. If the datetime values are in the index, then pass "index". nan_duplicate_threshold (float): The percentage of values in the `datetime_column` that must not be duplicate or nan before `DATETIME_NO_FREQUENCY_INFERRED` is returned instead of `DATETIME_HAS_UNEVEN_INTERVALS`. For example, if this is set to 0.80, then only 20% of the values in `datetime_column` can be duplicate or nan. Defaults to 0.75. + series_id (str): The name of the series_id column for multiseries. 
Defaults to None. """ - def __init__(self, datetime_column="index", nan_duplicate_threshold=0.75): + def __init__( + self, + datetime_column="index", + nan_duplicate_threshold=0.75, + series_id=None, + ): self.datetime_column = datetime_column self.nan_duplicate_threshold = nan_duplicate_threshold + self.series_id = series_id def validate(self, X, y): """Checks if the target data has equal intervals and is monotonically increasing. - Will return a DataCheckError if the data is not a datetime type, is not increasing, has redundant or missing row(s), + Will return DataCheckError(s) if the data is not a datetime type, is not increasing, has redundant or missing row(s), contains invalid (NaN or None) values, or has values that don't align with the assumed frequency. + If used for a multiseries problem, works specifically on stacked datasets. + Args: X (pd.DataFrame, np.ndarray): Features. y (pd.Series, np.ndarray): Target data. @@ -199,52 +210,6 @@ def validate(self, X, y): ... } ... ] - The column "Weeks" has a date that does not follow the weekly pattern, which is considered misaligned. - - >>> X = pd.DataFrame(pd.date_range("2021-01-01", freq="W", periods=12).append(pd.date_range("2021-03-22", periods=1)), columns=["Weeks"]) - >>> ww_payload = infer_frequency(X["Weeks"], debug=True, window_length=5, threshold=0.8) - >>> datetime_format_dc = DateTimeFormatDataCheck(datetime_column="Weeks") - >>> assert datetime_format_dc.validate(X, y) == [ - ... { - ... "message": "Column 'Weeks' has datetime values that do not align with the inferred frequency.", - ... "data_check_name": "DateTimeFormatDataCheck", - ... "level": "error", - ... "details": {"columns": None, "rows": None}, - ... "code": "DATETIME_HAS_MISALIGNED_VALUES", - ... "action_options": [] - ... }, - ... { - ... "message": "A frequency was detected in column 'Weeks', but there are faulty datetime values that need to be addressed.", - ... "data_check_name": "DateTimeFormatDataCheck", - ... "level": "error", - ... 
"code": "DATETIME_HAS_UNEVEN_INTERVALS", - ... "details": {'columns': None, 'rows': None}, - ... "action_options": [ - ... { - ... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', - ... 'data_check_name': 'DateTimeFormatDataCheck', - ... 'metadata': { - ... 'columns': None, - ... 'is_target': True, - ... 'rows': None - ... }, - ... 'parameters': { - ... 'time_index': { - ... 'default_value': 'Weeks', - ... 'parameter_type': 'global', - ... 'type': 'str' - ... }, - ... 'frequency_payload': { - ... 'default_value': ww_payload, - ... 'parameter_type': 'global', - ... 'type': 'tuple' - ... } - ... } - ... } - ... ] - ... } - ... ] - The column "Weeks" passed integers instead of datetime data, which will raise an error. >>> X = pd.DataFrame([1, 2, 3, 4], columns=["Weeks"]) @@ -345,15 +310,74 @@ def validate(self, X, y): ... ] ... } ... ] - ... + + For multiseries, the datacheck will go through each series and perform checks on them similar to the single series case + To denote that the datacheck is checking a multiseries, pass in the name of the series_id column to the datacheck + + >>> X = pd.DataFrame( + ... { + ... "date": pd.date_range("2021-01-01", periods=15).repeat(2), + ... "series_id": pd.Series(list(range(2)) * 15, dtype="str") + ... } + ... ) + >>> X = X.drop([15]) + >>> dc = DateTimeFormatDataCheck(datetime_column="date", series_id="series_id") + >>> ww_payload_expected_series1 = infer_frequency((X[X["series_id"] == "1"]["date"].reset_index(drop=True)), debug=True, window_length=4, threshold=0.4) + >>> xd = dc.validate(X,y) + >>> assert dc.validate(X, y) == [ + ... { + ... "message": "Column 'date' for series '1' has datetime values missing between start and end date.", + ... "data_check_name": "DateTimeFormatDataCheck", + ... "level": "error", + ... "details": {"columns": None, "rows": None}, + ... "code": "DATETIME_IS_MISSING_VALUES", + ... "action_options": [] + ... }, + ... { + ... 
"message": "A frequency was detected in column 'date' for series '1', but there are faulty datetime values that need to be addressed.", + ... "data_check_name": "DateTimeFormatDataCheck", + ... "level": "error", + ... "code": "DATETIME_HAS_UNEVEN_INTERVALS", + ... "details": {'columns': None, 'rows': None}, + ... "action_options": [ + ... { + ... 'code': 'REGULARIZE_AND_IMPUTE_DATASET', + ... 'data_check_name': 'DateTimeFormatDataCheck', + ... 'metadata': { + ... 'columns': None, + ... 'is_target': True, + ... 'rows': None + ... }, + ... 'parameters': { + ... 'time_index': { + ... 'default_value': 'date', + ... 'parameter_type': 'global', + ... 'type': 'str' + ... }, + ... 'frequency_payload': { + ... 'default_value': ww_payload_expected_series1, + ... 'parameter_type': 'global', + ... 'type': 'tuple' + ... } + ... } + ... } + ... ] + ... } + ... ] + """ messages = [] X = infer_feature_types(X) y = infer_feature_types(y) - + is_multiseries = self.series_id is not None no_dt_found = False + if self.series_id is not None and self.series_id not in X: + raise ValueError( + f"""series_id "{self.series_id}" is not in the dataset.""", + ) + if self.datetime_column != "index": datetime_values = X[self.datetime_column] else: @@ -378,110 +402,145 @@ def validate(self, X, y): ) return messages - # Check if the data is monotonically increasing - no_nan_datetime_values = datetime_values.dropna() - if not pd.DatetimeIndex(no_nan_datetime_values).is_monotonic_increasing: - messages.append( - DataCheckError( - message="Datetime values must be sorted in ascending order.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_IS_NOT_MONOTONIC, - ).to_dict(), - ) - - col_name = ( - self.datetime_column if self.datetime_column != "index" else "either index" - ) - - ww_payload = infer_frequency( - pd.Series(datetime_values), - debug=True, - window_length=4, - threshold=0.4, - ) - inferred_freq = ww_payload[0] - debug_object = ww_payload[1] - if inferred_freq is not 
None: - return messages - - # Check for NaN values - if len(debug_object["nan_values"]) > 0: - messages.append( - DataCheckError( - message=f"Input datetime column '{col_name}' contains NaN values. Please impute NaN values or drop these rows.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_NAN, - ).to_dict(), + series_datetime = [0] if self.series_id is None else X[self.series_id].unique() + for series in series_datetime: + # if multiseries only select the datetimes corresponding to one series + if is_multiseries: + curr_series_df = X[X[self.series_id] == series] + if self.datetime_column != "index": + datetime_values = curr_series_df[self.datetime_column].reset_index( + drop=True, + ) + else: + datetime_values = curr_series_df.index + + # Check if the data is monotonically increasing + no_nan_datetime_values = datetime_values.dropna() + if not pd.DatetimeIndex(no_nan_datetime_values).is_monotonic_increasing: + messages.append( + DataCheckError( + message="Datetime values must be sorted in ascending order.", + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_IS_NOT_MONOTONIC, + ).to_dict(), + ) + + col_name = ( + self.datetime_column + if self.datetime_column != "index" + else "either index" ) - - # Check for only one row per datetime - if len(debug_object["duplicate_values"]) > 0: - messages.append( - DataCheckError( - message=f"Column '{col_name}' has more than one row with the same datetime value.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, - ).to_dict(), + ww_payload = infer_frequency( + pd.Series(datetime_values), + debug=True, + window_length=4, + threshold=0.4, ) - - # Check for no date missing in ordered dates - if len(debug_object["missing_values"]) > 0: - messages.append( - DataCheckError( - message=f"Column '{col_name}' has datetime values missing between start and end date.", - data_check_name=self.name, - 
message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), + inferred_freq = ww_payload[0] + debug_object = ww_payload[1] + if inferred_freq is not None and is_multiseries: + continue + elif inferred_freq is not None: + return messages + + # Check for NaN values + if len(debug_object["nan_values"]) > 0: + series_message = f"Input datetime column '{col_name}' for series '{series}' contains NaN values. Please impute NaN values or drop these rows." + messages.append( + DataCheckError( + message=f"Input datetime column '{col_name}' contains NaN values. Please impute NaN values or drop these rows." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + ).to_dict(), + ) + + # Check for only one row per datetime + if len(debug_object["duplicate_values"]) > 0: + series_message = f"Column '{col_name}' for series '{series}' has more than one row with the same datetime value." + messages.append( + DataCheckError( + message=f"Column '{col_name}' has more than one row with the same datetime value." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, + ).to_dict(), + ) + + # Check for no date missing in ordered dates + if len(debug_object["missing_values"]) > 0: + series_message = f"Column '{col_name}' for series '{series}' has datetime values missing between start and end date." + messages.append( + DataCheckError( + message=f"Column '{col_name}' has datetime values missing between start and end date." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + ) + + # Check for dates that don't line up with the frequency + if len(debug_object["extra_values"]) > 0: + series_message = f"Column '{col_name}' for series '{series}' has datetime values that do not align with the inferred frequency." 
+ messages.append( + DataCheckError( + message=f"Column '{col_name}' has datetime values that do not align with the inferred frequency." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, + ).to_dict(), + ) + + datetime_values_no_nans_duplicates = ( + no_nan_datetime_values.drop_duplicates() ) - - # Check for dates that don't line up with the frequency - if len(debug_object["extra_values"]) > 0: - messages.append( - DataCheckError( - message=f"Column '{col_name}' has datetime values that do not align with the inferred frequency.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, - ).to_dict(), - ) - - datetime_values_no_nans_duplicates = no_nan_datetime_values.drop_duplicates() - # Give a generic uneven interval error no frequency can be estimated by woodwork - if debug_object["estimated_freq"] is None or len( - datetime_values_no_nans_duplicates, - ) <= self.nan_duplicate_threshold * len(datetime_values): - messages.append( - DataCheckError( - message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, - ).to_dict(), - ) - else: - messages.append( - DataCheckError( - message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed.", - data_check_name=self.name, - message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS, - action_options=[ - DataCheckActionOption( - DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET, - data_check_name=self.name, - parameters={ - "time_index": { - "parameter_type": DCAOParameterType.GLOBAL, - "type": "str", - "default_value": col_name, - }, - "frequency_payload": { - "parameter_type": DCAOParameterType.GLOBAL, - "type": "tuple", - "default_value": ww_payload, 
+ # Give a generic uneven interval error no frequency can be estimated by woodwork + if debug_object["estimated_freq"] is None or len( + datetime_values_no_nans_duplicates, + ) <= self.nan_duplicate_threshold * len(datetime_values): + series_message = f"No frequency could be detected in column '{col_name}' for series '{series}', possibly due to uneven intervals or too many duplicate/missing values." + messages.append( + DataCheckError( + message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values." + if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, + ).to_dict(), + ) + else: + series_message = f"A frequency was detected in column '{col_name}' for series '{series}', but there are faulty datetime values that need to be addressed." + messages.append( + DataCheckError( + message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed." 
+ if not is_multiseries + else series_message, + data_check_name=self.name, + message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS, + action_options=[ + DataCheckActionOption( + DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET, + data_check_name=self.name, + parameters={ + "time_index": { + "parameter_type": DCAOParameterType.GLOBAL, + "type": "str", + "default_value": col_name, + }, + "frequency_payload": { + "parameter_type": DCAOParameterType.GLOBAL, + "type": "tuple", + "default_value": ww_payload, + }, }, - }, - metadata={"is_target": True}, - ), - ], - ).to_dict(), - ) + metadata={"is_target": True}, + ), + ], + ).to_dict(), + ) return messages diff --git a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py index 4d864a6eb5..f75b9a6406 100644 --- a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py +++ b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py @@ -18,9 +18,12 @@ THRESHOLD = 0.4 -def get_uneven_error(col_name, ww_payload): +def get_uneven_error(col_name, ww_payload, series=None): + series_message = f"A frequency was detected in column '{col_name}' for series '{series}', but there are faulty datetime values that need to be addressed." error = DataCheckError( - message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed.", + message=f"A frequency was detected in column '{col_name}', but there are faulty datetime values that need to be addressed." 
+ if series is None + else series_message, data_check_name=datetime_format_check_name, message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS, action_options=[ @@ -51,31 +54,49 @@ def get_uneven_error(col_name, ww_payload): "issue", ["redundant", "missing", "uneven", "type_errors", None], ) -@pytest.mark.parametrize("datetime_loc", [1, "X_index", "y_index"]) +@pytest.mark.parametrize( + "datetime_loc, is_multiseries, repeat", + [ + (1, True, 2), + (1, False, 1), + ("X_index", True, 2), + ("X_index", False, 1), + ("y_index", False, 1), + ], +) def test_datetime_format_data_check_typeerror_uneven_intervals( issue, input_type, datetime_loc, + is_multiseries, + repeat, ): - X, y = pd.DataFrame({"features": range(30)}), pd.Series(range(30)) + if is_multiseries: + time_length = 60 + else: + time_length = 30 + + X, y = pd.DataFrame({"features": range(time_length)}), pd.Series(range(time_length)) + if is_multiseries: + X["series_id"] = pd.Series(list(range(2)) * 30, dtype="str") if issue == "type_errors": - dates = range(30) + dates = range(time_length) else: - dates = pd.date_range("2021-01-01", periods=30) + dates = pd.date_range("2021-01-01", periods=time_length) if issue == "missing": # Skips 2021-01-25 and starts again at 2021-01-27, skipping a date and triggering the error - dates = pd.date_range("2021-01-01", periods=25).append( - pd.date_range("2021-01-27", periods=5), + dates = (pd.date_range("2021-01-01", periods=25).repeat(repeat)).append( + (pd.date_range("2021-01-27", periods=5).repeat(repeat)), ) if issue == "uneven": - dates_1 = pd.date_range("2015-01-01", periods=5, freq="D") - dates_2 = pd.date_range("2015-01-08", periods=3, freq="D") - dates_3 = pd.DatetimeIndex(["2015-01-12"]) - dates_4 = pd.date_range("2015-01-15", periods=5, freq="D") - dates_5 = pd.date_range("2015-01-22", periods=5, freq="D") - dates_6 = pd.date_range("2015-01-29", periods=11, freq="M") + dates_1 = pd.date_range("2015-01-01", periods=5, freq="D").repeat(repeat) + dates_2 = 
pd.date_range("2015-01-08", periods=3, freq="D").repeat(repeat) + dates_3 = pd.DatetimeIndex(["2015-01-12"]).repeat(repeat) + dates_4 = pd.date_range("2015-01-15", periods=5, freq="D").repeat(repeat) + dates_5 = pd.date_range("2015-01-22", periods=5, freq="D").repeat(repeat) + dates_6 = pd.date_range("2015-01-29", periods=11, freq="M").repeat(repeat) dates = ( dates_1.append(dates_2) @@ -85,11 +106,12 @@ def test_datetime_format_data_check_typeerror_uneven_intervals( .append(dates_6) ) if issue == "redundant": - dates = pd.date_range("2021-01-01", periods=29).append( - pd.date_range("2021-01-29", periods=1), + dates = (pd.date_range("2021-01-01", periods=29).repeat(repeat)).append( + (pd.date_range("2021-01-29", periods=1).repeat(repeat)), ) datetime_column = "index" + if datetime_loc == 1: X[datetime_loc] = dates datetime_column = datetime_loc @@ -102,59 +124,125 @@ def test_datetime_format_data_check_typeerror_uneven_intervals( X.ww.init() y.ww.init() - datetime_format_check = DateTimeFormatDataCheck(datetime_column=datetime_column) - - if issue == "type_errors": - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message="Datetime information could not be found in the data, or was not in a supported datetime format.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_INFORMATION_NOT_FOUND, - ).to_dict(), - ] - else: - if datetime_loc == "X_index": - dates = pd.Series(X.index) - elif datetime_loc == "y_index": - dates = pd.Series(y.index) - else: - dates = X[datetime_column] - ww_payload = infer_frequency( - dates, - debug=True, - window_length=WINDOW_LENGTH, - threshold=THRESHOLD, + if is_multiseries: + datetime_format_check = DateTimeFormatDataCheck( + datetime_column=datetime_column, + series_id="series_id", ) - - col_name = datetime_loc if datetime_loc == 1 else "either index" - if issue is None: - assert datetime_format_check.validate(X, y) == [] - elif issue == "missing": - assert 
datetime_format_check.validate(X, y) == [ - DataCheckError( - message=f"Column '{col_name}' has datetime values missing between start and end date.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - get_uneven_error(col_name, ww_payload), - ] - elif issue == "redundant": - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message=f"Column '{col_name}' has more than one row with the same datetime value.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, - ).to_dict(), - get_uneven_error(col_name, ww_payload), - ] + else: + datetime_format_check = DateTimeFormatDataCheck(datetime_column=datetime_column) + + all_series = X["series_id"].unique() if is_multiseries else [0] + messages = [] + + for series in all_series: + if issue == "type_errors": + if len(messages) == 0: + # type error only gives 1 message regardless of how many series there are + messages.append( + DataCheckError( + message="Datetime information could not be found in the data, or was not in a supported datetime format.", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_INFORMATION_NOT_FOUND, + ).to_dict(), + ) else: - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message=f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, - ).to_dict(), - ] + if is_multiseries: + curr_series_df = X[X[datetime_format_check.series_id] == series] + + # separates the datetimes so it only displays the dates that correspond to the current series + if input_type == "ww" and is_multiseries: + # ww makes the series_id into ints so need to cast series into ints + if datetime_loc == "X_index": + dates = pd.Series( + 
X[X[datetime_format_check.series_id] == int(series)].index, + ) + else: + dates = X[X[datetime_format_check.series_id] == int(series)][ + datetime_column + ] + elif datetime_loc == "X_index": + if is_multiseries: + dates = pd.Series(curr_series_df.index) + else: + dates = pd.Series(X.index) + elif datetime_loc == "y_index": + dates = pd.Series(y.index) + else: + if is_multiseries: + dates = pd.Series(curr_series_df[datetime_column]) + else: + dates = X[datetime_column] + ww_payload_expected = infer_frequency( + # drop the original row labels so a per-series slice is indexed 0..n-1, mirroring the reset_index(drop=True) that validate() applies to each series + dates.reset_index(drop=True), + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + + col_name = datetime_loc if datetime_loc == 1 else "either index" + if issue is None: + break + elif issue == "missing": + if is_multiseries: + message = f"Column '{col_name}' for series '{series}' has datetime values missing between start and end date." + uneven_error = get_uneven_error( + col_name, + ww_payload_expected, + series, + ) + else: + message = f"Column '{col_name}' has datetime values missing between start and end date." + uneven_error = get_uneven_error(col_name, ww_payload_expected) + messages.extend( + [ + DataCheckError( + message=message, + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + uneven_error, + ], + ) + elif issue == "redundant": + if is_multiseries: + message = f"Column '{col_name}' for series '{series}' has more than one row with the same datetime value." + uneven_error = get_uneven_error( + col_name, + ww_payload_expected, + series, + ) + else: + message = f"Column '{col_name}' has more than one row with the same datetime value." 
+ uneven_error = get_uneven_error(col_name, ww_payload_expected) + messages.extend( + [ + DataCheckError( + message=message, + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_HAS_REDUNDANT_ROW, + ).to_dict(), + uneven_error, + ], + ) + else: + if is_multiseries: + message = f"No frequency could be detected in column '{col_name}' for series '{series}', possibly due to uneven intervals or too many duplicate/missing values." + else: + message = f"No frequency could be detected in column '{col_name}', possibly due to uneven intervals or too many duplicate/missing values." + + messages.append( + DataCheckError( + message=message, + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_NO_FREQUENCY_INFERRED, + ).to_dict(), + ) + if issue is None: + assert datetime_format_check.validate(X, y) == [] + else: + assert datetime_format_check.validate(X, y) == messages @pytest.mark.parametrize("sort_order", ["increasing", "decreasing", "mixed"]) @@ -202,40 +290,89 @@ def test_datetime_format_data_check_monotonic(datetime_loc, sort_order): @pytest.mark.parametrize("n_missing", [2, 5, 7]) -def test_datetime_format_data_check_multiple_missing(n_missing): +@pytest.mark.parametrize("is_multiseries, repeat", [(True, 2), (False, 1)]) +def test_datetime_format_data_check_multiple_missing(n_missing, is_multiseries, repeat): X, y = pd.DataFrame({"features": range(100)}), pd.Series(range(100)) + if is_multiseries: + X["series_id"] = pd.Series(list(range(2)) * 50, dtype="str") - dates = pd.date_range("2021-01-01", periods=15) + dates = pd.date_range("2021-01-01", periods=15).repeat(repeat) if n_missing == 2: # Two missing dates in separate spots - dates = dates.append(pd.date_range("2021-01-17", periods=86)).drop("2021-01-22") + if is_multiseries: + dates = dates.append( + pd.date_range("2021-01-17", periods=36).repeat(2), + ).drop( + "2021-01-22", + ) + else: + dates = dates.append(pd.date_range("2021-01-17", 
periods=86)).drop( + "2021-01-22", + ) elif n_missing == 5: # A chunk of 5 missing days in a row - dates = dates.append(pd.date_range("2021-01-21", periods=85)) + if is_multiseries: + dates = dates.append(pd.date_range("2021-01-21", periods=35).repeat(2)) + else: + dates = dates.append(pd.date_range("2021-01-21", periods=85)) else: # Some chunks missing and some alone missing - dates = dates.append(pd.date_range("2021-01-20", periods=88)).drop("2021-01-27") + if is_multiseries: + dates = dates.append( + pd.date_range("2021-01-19", periods=39).repeat(2), + ).drop( + "2021-01-27", + ) + dates = dates.drop("2021-01-20") + else: + dates = dates.append(pd.date_range("2021-01-20", periods=88)).drop( + "2021-01-27", + ) dates = dates.drop("2021-02-22") dates = dates.drop("2021-01-11") X["dates"] = dates - datetime_format_check = DateTimeFormatDataCheck(datetime_column="dates") - ww_payload = infer_frequency( - X["dates"], - debug=True, - window_length=WINDOW_LENGTH, - threshold=THRESHOLD, - ) + if is_multiseries: + datetime_format_check = DateTimeFormatDataCheck( + datetime_column="dates", + series_id="series_id", + ) + else: + datetime_format_check = DateTimeFormatDataCheck(datetime_column="dates") - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message="Column 'dates' has datetime values missing between start and end date.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - get_uneven_error("dates", ww_payload), - ] + messages = [] + series_list = X["series_id"].unique() if is_multiseries else [0] + + for series in series_list: + observed_ts = ( + X[X["series_id"] == series]["dates"].reset_index(drop=True) + if is_multiseries + else X["dates"] + ) + ww_payload_expected = infer_frequency( + observed_ts, + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + if is_multiseries: + message = f"""Column 'dates' for series '{series}' has datetime values 
missing between start and end date.""" + uneven_error = get_uneven_error("dates", ww_payload_expected, series) + else: + message = """Column 'dates' has datetime values missing between start and end date.""" + uneven_error = get_uneven_error("dates", ww_payload_expected) + messages.extend( + [ + DataCheckError( + message=message, + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + uneven_error, + ], + ) + assert datetime_format_check.validate(X, y) == messages def test_datetime_format_data_check_multiple_errors(): @@ -349,35 +486,6 @@ def test_datetime_format_data_check_multiple_errors(): get_uneven_error("dates", ww_payload), ] - dates = ( - pd.date_range("2021-01-01", periods=15, freq="2D") - .drop("2021-01-13") - .append(pd.date_range("2021-01-30", periods=1)) - .append(pd.date_range("2021-01-31", periods=86, freq="2D")) - ) - X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") - - ww_payload = infer_frequency( - X["dates"], - debug=True, - window_length=WINDOW_LENGTH, - threshold=THRESHOLD, - ) - - assert datetime_format_check.validate(X, y) == [ - DataCheckError( - message="Column 'dates' has datetime values missing between start and end date.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, - ).to_dict(), - DataCheckError( - message="Column 'dates' has datetime values that do not align with the inferred frequency.", - data_check_name=datetime_format_check_name, - message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, - ).to_dict(), - get_uneven_error("dates", ww_payload), - ] - def test_datetime_format_unusual_interval(): dates = pd.date_range(start="2021-01-01", periods=100, freq="4D") @@ -541,3 +649,98 @@ def test_datetime_many_duplicates_and_nans(): result = dc.validate(X, y) assert result[2]["code"] == "DATETIME_NO_FREQUENCY_INFERRED" + + +def 
test_datetime_format_data_check_invalid_seriesid_multiseries( + multiseries_ts_data_stacked, +): + X, y = multiseries_ts_data_stacked + datetime_format_check = DateTimeFormatDataCheck( + datetime_column="Date", + series_id="not_series_id", + ) + with pytest.raises( + ValueError, + match="""series_id "not_series_id" is not in the dataset.""", + ): + datetime_format_check.validate(X, y) + + +@pytest.mark.parametrize("nans", [0, 1, 2]) +def test_datetime_format_data_check_nan_multiseries(nans): + dates = pd.Series(pd.date_range(start="2021-01-01", periods=20).repeat(2)) + if nans == 1: + dates[0] = np.NaN + elif nans == 2: + dates[0] = np.NaN + dates[1] = np.NaN + X = pd.DataFrame(dates, columns=["date"]) + X["series_id"] = pd.Series(list(range(2)) * 20, dtype="str") + + messages = [] + for series in X["series_id"].unique(): + ww_payload_expected = infer_frequency( + X[X["series_id"] == series]["date"].reset_index(drop=True), + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + if (series == "0" and nans >= 1) or (series == "1" and nans >= 2): + messages.extend( + [ + DataCheckError( + message=f"""Input datetime column 'date' for series '{series}' contains NaN values. 
Please impute NaN values or drop these rows.""", + data_check_name=DateTimeFormatDataCheck.name, + message_code=DataCheckMessageCode.DATETIME_HAS_NAN, + ).to_dict(), + get_uneven_error("date", ww_payload_expected, series), + ], + ) + + dt_nan_check = DateTimeFormatDataCheck( + datetime_column="date", + series_id="series_id", + ) + assert dt_nan_check.validate(X, pd.Series(dtype="float64")) == messages + + +def test_datetime_format_data_check_multiseries_not_aligned_frequency(): + dates = ( + pd.date_range("2021-01-01", periods=15, freq="2D") + .repeat(2) + .drop("2021-01-13") + .append(pd.date_range("2021-01-30", periods=1).repeat(2)) + .append(pd.date_range("2021-01-31", periods=35, freq="2D").repeat(2)) + ) + X = pd.DataFrame({"dates": dates}, dtype="datetime64[ns]") + X["series_id"] = pd.Series(list(range(2)) * 50, dtype="str") + datetime_format_check = DateTimeFormatDataCheck( + datetime_column="dates", + series_id="series_id", + ) + + messages = [] + for series in X["series_id"].unique(): + ww_payload_expected = infer_frequency( + X[X["series_id"] == series]["dates"].reset_index(drop=True), + debug=True, + window_length=WINDOW_LENGTH, + threshold=THRESHOLD, + ) + + messages.extend( + [ + DataCheckError( + message=f"""Column 'dates' for series '{series}' has datetime values missing between start and end date.""", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_IS_MISSING_VALUES, + ).to_dict(), + DataCheckError( + message=f"""Column 'dates' for series '{series}' has datetime values that do not align with the inferred frequency.""", + data_check_name=datetime_format_check_name, + message_code=DataCheckMessageCode.DATETIME_HAS_MISALIGNED_VALUES, + ).to_dict(), + get_uneven_error("dates", ww_payload_expected, series), + ], + ) + assert datetime_format_check.validate(X, pd.Series(dtype="float64")) == messages