From 88395931297ae7a5a2730b04c096d073e373eb5b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 14 Dec 2021 07:28:55 -0800 Subject: [PATCH 1/5] raise if dtype is not supported --- python/cudf/cudf/utils/queryutils.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 217466a5a1b..bb4a37ad4f7 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -10,9 +10,19 @@ import cudf from cudf.core.column import column_empty from cudf.utils import applyutils +from cudf.utils.dtypes import ( + BOOL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + TIMEDELTA_TYPES, +) ENVREF_PREFIX = "__CUDF_ENVREF__" +SUPPORTED_QUERY_TYPES = ( + NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + BOOL_TYPES +) + class QuerySyntaxError(ValueError): pass @@ -197,6 +207,17 @@ def query_execute(df, expr, callenv): # compile compiled = query_compile(expr) + columns = compiled["colnames"] + + # wait to check the types until we know which cols are used + if any( + df._data[col].dtype not in SUPPORTED_QUERY_TYPES for col in columns + ): + raise TypeError( + "query only supports numeric, datetime, timedelta," + "or bool dtypes." + ) + kernel = compiled["kernel"] # process env args envargs = [] @@ -214,7 +235,7 @@ def query_execute(df, expr, callenv): raise NameError(msg.format(name)) else: envargs.append(val) - columns = compiled["colnames"] + # prepare col args colarrays = [ From e25b5601a5a400ca1753845470043b2c02af71a5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 Dec 2021 07:46:20 -0800 Subject: [PATCH 2/5] fix bugs --- python/cudf/cudf/utils/queryutils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index bb4a37ad4f7..926e6a01b79 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -19,9 +19,10 @@ ENVREF_PREFIX = "__CUDF_ENVREF__" -SUPPORTED_QUERY_TYPES = ( - NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + BOOL_TYPES -) +SUPPORTED_QUERY_TYPES = { + np.dtype(dt) + for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES +} class QuerySyntaxError(ValueError): @@ -210,11 +211,12 @@ def query_execute(df, expr, callenv): columns = compiled["colnames"] # wait to check the types until we know which cols are used + breakpoint() if any( df._data[col].dtype not in SUPPORTED_QUERY_TYPES for col in columns ): raise TypeError( - "query only supports numeric, datetime, timedelta," + "query only supports numeric, datetime, timedelta, " "or bool dtypes." ) From b3d91f9b6a63ef3738095ba94c91e951aff0ce75 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 17 Dec 2021 07:38:43 -0800 Subject: [PATCH 3/5] fix up impl --- python/cudf/cudf/utils/queryutils.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 926e6a01b79..d9153c2b1d2 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -210,16 +210,18 @@ def query_execute(df, expr, callenv): compiled = query_compile(expr) columns = compiled["colnames"] + # prepare col args + colarrays = [cudf.core.dataframe.extract_col(df, col) for col in columns] + # wait to check the types until we know which cols are used - breakpoint() - if any( - df._data[col].dtype not in SUPPORTED_QUERY_TYPES for col in columns - ): + if any(col.dtype not in SUPPORTED_QUERY_TYPES for col in colarrays): raise TypeError( "query only supports numeric, datetime, timedelta, " "or bool dtypes." ) + colarrays = [col.data_array_view for col in colarrays] + kernel = compiled["kernel"] # process env args envargs = [] @@ -238,13 +240,6 @@ def query_execute(df, expr, callenv): else: envargs.append(val) - # prepare col args - - colarrays = [ - cudf.core.dataframe.extract_col(df, col).data_array_view - for col in columns - ] - # allocate output buffer nrows = len(df) out = column_empty(nrows, dtype=np.bool_) From d05f05f457481759b309ec0f207fdbd64494de63 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 17 Dec 2021 07:55:33 -0800 Subject: [PATCH 4/5] add test --- python/cudf/cudf/tests/test_query.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 9a02d5145bb..aef6699965e 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -209,3 +209,25 @@ def test_query_with_index_keyword(query, a_val, b_val, c_val): expect = pdf.query(query) assert_eq(out, expect) + + +@pytest.mark.parametrize( + "data, query", + [ + # Only need to test the + (["a", "b", "c"], "data == 'a'"), + ], +) +def test_query_unsupported_dtypes(data, query): + gdf = cudf.DataFrame({"data": data}) + + # make sure the query works in pandas + pdf = gdf.to_pandas() + pdf_result = pdf.query(query) + + expect = pd.DataFrame({"data": ["a"]}) + assert_eq(expect, pdf_result) + + # but fails in cuDF + with pytest.raises(TypeError): + gdf.query(query) From 09132ceacf216e8ef17fb15b4c9867eaa96aeb22 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 4 Jan 2022 09:19:24 -0800 Subject: [PATCH 5/5] finish up comment --- python/cudf/cudf/tests/test_query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index aef6699965e..3de38b2cf6f 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -214,7 +214,8 @@ def test_query_with_index_keyword(query, a_val, b_val, c_val): @pytest.mark.parametrize( "data, query", [ - # Only need to test the + # Only need to test the dtypes that pandas + # supports but that we do not (["a", "b", "c"], "data == 'a'"), ], )