Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make perfect ID column matches possible with text columns #4203

Merged
merged 18 commits into from
Jun 15, 2023
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Release Notes
**Future Releases**
* Enhancements
* Fixes
* `IDColumnsDataCheck` now works with the Unknown logical type :pr:`4203`
* Changes
* Documentation Changes
* Testing Changes
Expand Down
10 changes: 7 additions & 3 deletions evalml/data_checks/id_columns_data_check.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Data check that checks if any of the features are likely to be ID columns."""

from evalml.data_checks import (
DataCheck,
DataCheckActionCode,
Expand Down Expand Up @@ -180,16 +181,19 @@ def validate(self, X, y=None):
] # columns whose name is "id"
id_cols = {col: 0.95 for col in cols_named_id}

for dtypes in [["Double"], ["Integer", "IntegerNullable", "Categorical"]]:
X_temp = X.ww.select(include=dtypes)
for types in [
["Double"],
["Integer", "IntegerNullable", "Categorical", "Unknown"],
]:
X_temp = X.ww.select(include=types)
check_all_unique = X_temp.nunique() == len(X_temp)
cols_with_all_unique = check_all_unique[
check_all_unique
].index.tolist() # columns whose values are all unique

# Temporary solution for downstream instances of integers being mapped to doubles.
# Will be removed when resolved.
if dtypes == ["Double"]:
if types == ["Double"]:
cols_with_all_unique = [
col
for col in cols_with_all_unique
Expand Down
24 changes: 20 additions & 4 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def graphviz():
def get_test_data_with_or_without_primary_key():
def _get_test_data_with_primary_key(input_type, has_primary_key):
X = None
if input_type == "integer":
if input_type == "Integer":
X_dict = {
"col_1_id": [0, 1, 2, 3],
"col_2": [2, 3, 4, 5],
Expand All @@ -117,7 +117,7 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
X_dict["col_1_id"] = [1, 1, 2, 3]
X = pd.DataFrame.from_dict(X_dict)

elif input_type == "integer_nullable":
elif input_type == "IntegerNullable":
X_dict = {
"col_1_id": pd.Series([0, 1, 2, 3], dtype="Int64"),
"col_2": pd.Series([2, 3, 4, 5], dtype="Int64"),
Expand All @@ -128,7 +128,7 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
X_dict["col_1_id"] = pd.Series([1, 1, 2, 3], dtype="Int64")
X = pd.DataFrame.from_dict(X_dict)

elif input_type == "double":
elif input_type == "Double":
X_dict = {
"col_1_id": [0.0, 1.0, 2.0, 3.0],
"col_2": [2, 3, 4, 5],
Expand All @@ -139,7 +139,23 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
X_dict["col_1_id"] = [1.0, 1.0, 2.0, 3.0]
X = pd.DataFrame.from_dict(X_dict)

elif input_type == "string":
elif input_type == "Unknown":
X_dict = {
"col_1_id": ["a", "b", "c", "d"],
"col_2": ["w", "x", "y", "z"],
"col_3_id": [
"123456789012345",
"234567890123456",
"3456789012345678",
"45678901234567",
],
"col_5": ["0", "0", "1", "2"],
}
if not has_primary_key:
X_dict["col_1_id"] = ["b", "b", "c", "d"]
X = pd.DataFrame.from_dict(X_dict)

elif input_type == "Categorical":
X_dict = {
"col_1_id": ["a", "b", "c", "d"],
"col_2": ["w", "x", "y", "z"],
Expand Down
16 changes: 9 additions & 7 deletions evalml/tests/data_checks_tests/test_id_columns_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def test_id_cols_data_check_input_formats(logical_type):

@pytest.mark.parametrize(
"input_type",
["integer", "integer_nullable", "string", "double"],
["Integer", "IntegerNullable", "Unknown", "Double", "Categorical"],
)
def test_identified_first_col_primary_key(
input_type,
Expand Down Expand Up @@ -300,7 +300,7 @@ def test_identified_first_col_primary_key(

@pytest.mark.parametrize(
"input_type",
["integer", "integer_nullable", "string", "double"],
["Integer", "IntegerNullable", "Unknown", "Double", "Categorical"],
)
def test_unidentified_first_col_primary_key(
input_type,
Expand All @@ -312,7 +312,7 @@ def test_unidentified_first_col_primary_key(
)

id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
if input_type == "string":
if input_type in ["Unknown", "Categorical"]:
order = ["col_2", "col_3_id", "col_1_id"]
else:
order = ["col_2", "col_1_id", "col_3_id"]
Expand All @@ -335,13 +335,15 @@ def test_unidentified_first_col_primary_key(
]

X = X.rename(columns={"col_1_id": "col_1"})
if input_type == "integer":
if input_type == "Integer":
X.at[0, "col_1"] = 0
elif input_type == "integer_nullable":
elif input_type == "IntegerNullable":
X.at[0, "col_1"] = 0
elif input_type == "double":
elif input_type == "Double":
X.at[0, "col_1"] = 0.0
elif input_type == "string":
bchen1116 marked this conversation as resolved.
Show resolved Hide resolved
elif input_type == "Unknown":
X.at[0, "col_1"] = "a"
elif input_type == "Categorical":
X["col_1"] = X["col_1"].cat.add_categories("a")
X.at[0, "col_1"] = "a"

Expand Down