-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement basic COALESCE functionality #823
Changes from all commits
80d1a11
704e62e
497acb5
e44badb
c9e544b
cd1a147
2000932
9a76366
22cf6ad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -573,6 +573,25 @@ def overlay(self, s, replace, start, length=None): | |
return s | ||
|
||
|
||
class CoalesceOperation(Operation): | ||
def __init__(self): | ||
super().__init__(self.coalesce) | ||
|
||
def coalesce(self, *operands): | ||
result = None | ||
for operand in operands: | ||
if is_frame(operand): | ||
# Check if frame evaluates to nan or NA | ||
if len(operand) == 1 and not operand.isnull().all().compute(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In what case could we have an operand where the len is 1 but it's also a interpreted as a frame? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comes up when we pass through aggregations as input; if we attempt something like There's some additional context in #823 (comment), which I ended up resolving with 2000932 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense. It isn't ideal but I don't think there's a better way to accomplish this today |
||
return operand if result is None else result.fillna(operand) | ||
else: | ||
result = operand if result is None else result.fillna(operand) | ||
elif not pd.isna(operand): | ||
return operand if result is None else result.fillna(operand) | ||
|
||
return result | ||
|
||
|
||
class ExtractOperation(Operation): | ||
def __init__(self): | ||
super().__init__(self.extract) | ||
|
@@ -1059,6 +1078,7 @@ class RexCallPlugin(BaseRexPlugin): | |
"substr": SubStringOperation(), | ||
"substring": SubStringOperation(), | ||
"initcap": TensorScalarOperation(lambda x: x.str.title(), lambda x: x.title()), | ||
"coalesce": CoalesceOperation(), | ||
"replace": ReplaceOperation(), | ||
# date/time operations | ||
"extract": ExtractOperation(), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -355,6 +355,65 @@ def test_null(c): | |
assert_eq(df, expected_df) | ||
|
||
|
||
@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) | ||
def test_coalesce(c, gpu): | ||
df = dd.from_pandas( | ||
pd.DataFrame({"a": [1, 2, 3], "b": [np.nan] * 3}), npartitions=1 | ||
) | ||
c.create_table("df", df, gpu=gpu) | ||
|
||
df = c.sql( | ||
""" | ||
SELECT | ||
COALESCE(3, 5) as c1, | ||
COALESCE(NULL, NULL) as c2, | ||
COALESCE(NULL, 'hi') as c3, | ||
COALESCE(NULL, NULL, 'bye', 5/0) as c4, | ||
COALESCE(NULL, 3/2, NULL, 'fly') as c5, | ||
COALESCE(SUM(b), 'why', 2.2) as c6, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ayushdg @charlesbluca This line is now failing on gpu with the newest dask-sql environment. Specifically,
throws:
This doesn't fail on CPU nor does it fail on the roughly equivalent query
Any Idea what might be happening? Could this be due to a change to cudf? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Taking a look right now There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you list the cuDF conda packages you're using (assuming this is using conda packages and not source)? I pulled in the latest 22.12 nightlies and wasn't able to reproduce:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep here they are:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And here's my full environment
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I still don't seem to be able to reproduce; the only difference in my environment is that it's missing Why don't we pull in the latest changes to see if these failures crop up in gpuCI |
||
COALESCE(NULL, MEAN(b), MEAN(a), 4/0) as c7 | ||
FROM df | ||
""" | ||
) | ||
|
||
expected_df = pd.DataFrame( | ||
{ | ||
"c1": [3], | ||
"c2": [np.nan], | ||
"c3": ["hi"], | ||
"c4": ["bye"], | ||
"c5": ["1"], | ||
"c6": ["why"], | ||
"c7": [2.0], | ||
} | ||
) | ||
|
||
assert_eq(df, expected_df, check_dtype=False) | ||
|
||
df = c.sql( | ||
""" | ||
SELECT | ||
COALESCE(a, b) as c1, | ||
COALESCE(b, a) as c2, | ||
COALESCE(a, a) as c3, | ||
COALESCE(b, b) as c4 | ||
FROM df | ||
""" | ||
) | ||
|
||
expected_df = pd.DataFrame( | ||
{ | ||
"c1": [1, 2, 3], | ||
"c2": [1, 2, 3], | ||
"c3": [1, 2, 3], | ||
"c4": [np.nan] * 3, | ||
} | ||
) | ||
|
||
assert_eq(df, expected_df, check_dtype=False) | ||
c.drop_table("df") | ||
|
||
|
||
def test_boolean_operations(c): | ||
df = dd.from_pandas(pd.DataFrame({"b": [1, 0, -1]}), npartitions=1) | ||
df["b"] = df["b"].apply( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,11 +37,8 @@ | |
69, | ||
70, | ||
72, | ||
75, | ||
77, | ||
78, | ||
80, | ||
84, | ||
86, | ||
87, | ||
88, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we throw in a comment here giving the reasoning for this override (IIUC differences in nullable "object" columns between cuDF and pandas)?