alteryx · simha104 · Jun 15, 2023 · Jun 7, 2023 · Jun 8, 2023 · Jun 8, 2023
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -3,6 +3,7 @@ Release Notes
 **Future Releases**
     * Enhancements
     * Fixes
+        * ``IDColumnsDataCheck`` now works with the ``Unknown`` data type :pr:`4203`
     * Changes
     * Documentation Changes
     * Testing Changes

diff --git a/evalml/data_checks/id_columns_data_check.py b/evalml/data_checks/id_columns_data_check.py
@@ -1,4 +1,5 @@
 """Data check that checks if any of the features are likely to be ID columns."""
+
 from evalml.data_checks import (
     DataCheck,
     DataCheckActionCode,
@@ -180,16 +181,19 @@ def validate(self, X, y=None):
         ]  # columns whose name is "id"
         id_cols = {col: 0.95 for col in cols_named_id}
 
-        for dtypes in [["Double"], ["Integer", "IntegerNullable", "Categorical"]]:
-            X_temp = X.ww.select(include=dtypes)
+        for types in [
+            ["Double"],
+            ["Integer", "IntegerNullable", "Categorical", "Unknown"],
+        ]:
+            X_temp = X.ww.select(include=types)
             check_all_unique = X_temp.nunique() == len(X_temp)
             cols_with_all_unique = check_all_unique[
                 check_all_unique
             ].index.tolist()  # columns whose values are all unique
 
             # Temporary solution for downstream instances of integers being mapped to doubles.
             # Will be removed when resolved.
-            if dtypes == ["Double"]:
+            if types == ["Double"]:
                 cols_with_all_unique = [
                     col
                     for col in cols_with_all_unique

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
@@ -106,7 +106,7 @@ def graphviz():
 def get_test_data_with_or_without_primary_key():
     def _get_test_data_with_primary_key(input_type, has_primary_key):
         X = None
-        if input_type == "integer":
+        if input_type == "Integer":
             X_dict = {
                 "col_1_id": [0, 1, 2, 3],
                 "col_2": [2, 3, 4, 5],
@@ -117,7 +117,7 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
                 X_dict["col_1_id"] = [1, 1, 2, 3]
             X = pd.DataFrame.from_dict(X_dict)
 
-        elif input_type == "integer_nullable":
+        elif input_type == "IntegerNullable":
             X_dict = {
                 "col_1_id": pd.Series([0, 1, 2, 3], dtype="Int64"),
                 "col_2": pd.Series([2, 3, 4, 5], dtype="Int64"),
@@ -128,7 +128,7 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
                 X_dict["col_1_id"] = pd.Series([1, 1, 2, 3], dtype="Int64")
             X = pd.DataFrame.from_dict(X_dict)
 
-        elif input_type == "double":
+        elif input_type == "Double":
             X_dict = {
                 "col_1_id": [0.0, 1.0, 2.0, 3.0],
                 "col_2": [2, 3, 4, 5],
@@ -139,7 +139,23 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
                 X_dict["col_1_id"] = [1.0, 1.0, 2.0, 3.0]
             X = pd.DataFrame.from_dict(X_dict)
 
-        elif input_type == "string":
+        elif input_type == "Unknown":
+            X_dict = {
+                "col_1_id": ["a", "b", "c", "d"],
+                "col_2": ["w", "x", "y", "z"],
+                "col_3_id": [
+                    "123456789012345",
+                    "234567890123456",
+                    "3456789012345678",
+                    "45678901234567",
+                ],
+                "col_5": ["0", "0", "1", "2"],
+            }
+            if not has_primary_key:
+                X_dict["col_1_id"] = ["b", "b", "c", "d"]
+            X = pd.DataFrame.from_dict(X_dict)
+
+        elif input_type == "Categorical":
             X_dict = {
                 "col_1_id": ["a", "b", "c", "d"],
                 "col_2": ["w", "x", "y", "z"],

diff --git a/evalml/tests/data_checks_tests/test_id_columns_data_check.py b/evalml/tests/data_checks_tests/test_id_columns_data_check.py
@@ -227,7 +227,7 @@ def test_id_cols_data_check_input_formats(logical_type):
 
 @pytest.mark.parametrize(
     "input_type",
-    ["integer", "integer_nullable", "string", "double"],
+    ["Integer", "IntegerNullable", "Unknown", "Double", "Categorical"],
 )
 def test_identified_first_col_primary_key(
     input_type,
@@ -300,7 +300,7 @@ def test_identified_first_col_primary_key(
 
 @pytest.mark.parametrize(
     "input_type",
-    ["integer", "integer_nullable", "string", "double"],
+    ["Integer", "IntegerNullable", "Unknown", "Double", "Categorical"],
 )
 def test_unidentified_first_col_primary_key(
     input_type,
@@ -312,7 +312,7 @@ def test_unidentified_first_col_primary_key(
     )
 
     id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
-    if input_type == "string":
+    if input_type in ["Unknown", "Categorical"]:
         order = ["col_2", "col_3_id", "col_1_id"]
     else:
         order = ["col_2", "col_1_id", "col_3_id"]
@@ -335,13 +335,15 @@ def test_unidentified_first_col_primary_key(
     ]
 
     X = X.rename(columns={"col_1_id": "col_1"})
-    if input_type == "integer":
+    if input_type == "Integer":
         X.at[0, "col_1"] = 0
-    elif input_type == "integer_nullable":
+    elif input_type == "IntegerNullable":
         X.at[0, "col_1"] = 0
-    elif input_type == "double":
+    elif input_type == "Double":
         X.at[0, "col_1"] = 0.0
-    elif input_type == "string":
+    elif input_type == "Unknown":
+        X.at[0, "col_1"] = "a"
+    elif input_type == "Categorical":
         X["col_1"] = X["col_1"].cat.add_categories("a")
         X.at[0, "col_1"] = "a"