-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Column equality testing fixes #10011
Changes from 7 commits
a77f861
812577b
4562f75
5517835
f088adf
042a4ed
7c1b40d
2644898
599253d
a44b97f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,12 @@ | |
import pandas as pd | ||
|
||
import cudf | ||
from cudf.api.types import is_categorical_dtype, is_numeric_dtype | ||
from cudf._lib.unary import is_nan | ||
from cudf.api.types import ( | ||
is_categorical_dtype, | ||
is_numeric_dtype, | ||
is_string_dtype, | ||
) | ||
from cudf.core._compat import PANDAS_GE_110 | ||
|
||
|
||
|
@@ -201,31 +206,49 @@ def assert_column_equal( | |
): | ||
left = left.astype(left.categories.dtype) | ||
right = right.astype(right.categories.dtype) | ||
|
||
columns_equal = False | ||
try: | ||
columns_equal = ( | ||
( | ||
cp.all(left.isnull().values == right.isnull().values) | ||
and cp.allclose( | ||
if left.size == right.size == 0: | ||
columns_equal = True | ||
elif not ( | ||
(is_string_dtype(left) and is_numeric_dtype(right)) | ||
or (is_numeric_dtype(left) and is_string_dtype(right)) | ||
): | ||
try: | ||
# nulls must be in the same places for all dtypes | ||
nulls_equal = cp.all(left.isnull().values == right.isnull().values) | ||
vyasr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if not check_exact and is_numeric_dtype(left): | ||
# non-null values must be the same | ||
values_equal = cp.allclose( | ||
left[left.isnull().unary_operator("not")].values, | ||
right[right.isnull().unary_operator("not")].values, | ||
) | ||
) | ||
if not check_exact and is_numeric_dtype(left) | ||
else left.equals(right) | ||
) | ||
except TypeError as e: | ||
if str(e) != "Categoricals can only compare with the same type": | ||
raise e | ||
if is_categorical_dtype(left) and is_categorical_dtype(right): | ||
left = left.astype(left.categories.dtype) | ||
right = right.astype(right.categories.dtype) | ||
if not left.dtype.kind == right.dtype.kind == "f": | ||
columns_equal = nulls_equal and values_equal | ||
else: | ||
# nans must be the same for float types | ||
nans_equal = cp.all( | ||
is_nan(left).values == is_nan(right).values | ||
) | ||
columns_equal = nulls_equal and values_equal and nans_equal | ||
else: | ||
columns_equal = left.equals(right) | ||
except TypeError as e: | ||
if str(e) != "Categoricals can only compare with the same type": | ||
raise e | ||
if is_categorical_dtype(left) and is_categorical_dtype(right): | ||
left = left.astype(left.categories.dtype) | ||
right = right.astype(right.categories.dtype) | ||
vyasr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if not columns_equal: | ||
msg1 = f"{left.values_host}" | ||
msg2 = f"{right.values_host}" | ||
ldata = [val for val in left.to_pandas(nullable=True)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Side question - I've wondered how reliable There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like they're being fairly forward with these as of 1.2.0 |
||
rdata = [val for val in right.to_pandas(nullable=True)] | ||
msg1 = f"{ldata}" | ||
msg2 = f"{rdata}" | ||
vyasr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
try: | ||
diff = left.apply_boolean_mask(left != right).size | ||
diff = 0 | ||
for i in range(left.size): | ||
if not null_safe_scalar_equals(left[i], right[i]): | ||
diff += 1 | ||
diff = diff * 100.0 / left.size | ||
except BaseException: | ||
diff = 100.0 | ||
|
@@ -234,6 +257,12 @@ def assert_column_equal( | |
) | ||
|
||
|
||
def null_safe_scalar_equals(left, right): | ||
if left in {cudf.NA, np.nan} or right in {cudf.NA, np.nan}: | ||
return left is right | ||
return left == right | ||
vyasr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def assert_index_equal( | ||
left, | ||
right, | ||
|
@@ -358,7 +387,6 @@ def assert_index_equal( | |
obj=mul_obj, | ||
) | ||
return | ||
|
||
assert_column_equal( | ||
left._columns[0], | ||
right._columns[0], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure if I follow these checks. Where the handler if the input falls in these dtypes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah this seems odd. Are we just generally looking to avoid checking this for mismatched dtypes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If one column is string and the other is numeric, we can assume the columns are not equal. (
1 != '1')
. These lines check to make sure exactly one of the columns is string and the other is numeric, in which case we avoid the entire try/except block and therefore avoid any opportunity forcolumns_equal
to be set toTrue
. We should end up on line 243 from there.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this only an issue for string types, or do we need to worry about other types as well? I see categoricals are handled above, what about list/struct dtypes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I looked into what happens here and it would appear that when
list
orstruct
are involved we fall into theleft.equals(right)
check at the very end and end up with aTypeError
, except in the case that we're comparinglist
tolist
orstruct
tostruct
. That should probably not happen.I see the point I think you are making though: there's a certain set of dtypes (beyond just string) where, even if
check_dtype=False
, we know up front that it's a 100% mismatch between non-null elements simply because a struct can't compare as equal to a "not struct". I think these dtypes are:I suppose the only edge case here would be that we should arguably return
True
even for that subset of dtypes if the columns are fully null.If this seems like the correct logic I am happy to go back and wire it up as such here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep that all sounds good. For the fully null case, I think whether or not we return True should be determined by
check_dtypes
, but if someone has a different opinion I'm open to a different result.