From f0bcbb99edd2e45101db9583c8fe810b815a066e Mon Sep 17 00:00:00 2001 From: Kuhu Shukla Date: Thu, 7 Jan 2021 09:48:12 -0600 Subject: [PATCH 1/3] whitelist structs and arrays for shuffle Signed-off-by: Kuhu Shukla --- docs/supported_ops.md | 8 ++++---- integration_tests/src/main/python/array_test.py | 11 ++++++++--- integration_tests/src/main/python/data_gen.py | 1 + integration_tests/src/main/python/struct_test.py | 15 ++++++++++++--- .../com/nvidia/spark/rapids/GpuOverrides.scala | 6 ++++-- 5 files changed, 29 insertions(+), 12 deletions(-) diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 6d2e06f6295..85e158655ab 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -352,9 +352,9 @@ Accelerator supports are described below. S NS NS +PS* (missing nested BINARY, CALENDAR, MAP, UDT) NS -NS -NS +PS* (missing nested BINARY, CALENDAR, MAP, UDT) NS @@ -536,9 +536,9 @@ Accelerator supports are described below. S NS NS +PS* (missing nested BINARY, CALENDAR, MAP, UDT) NS -NS -NS +PS* (missing nested BINARY, CALENDAR, MAP, UDT) NS diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index 309de60ae85..23b7ddf7004 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -14,11 +14,9 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql from data_gen import * -from marks import incompat from pyspark.sql.types import * -import pyspark.sql.functions as f # Once we support arrays as literals then we can support a[null] and # negative indexes for all array gens. 
When that happens @@ -46,3 +44,10 @@ def test_nested_array_index(data_gen): 'a[1]', 'a[3]', 'a[50]')) + +@pytest.mark.parametrize('data_gen', single_level_array_gens_non_decimal, ids=idfn) +def test_orderby_array(data_gen): + assert_gpu_and_cpu_are_equal_sql( + lambda spark : unary_op_df(spark, data_gen), + 'array_table', + 'select array_table.a, array_table.a[0] as first_val from array_table order by first_val') diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 655ede78500..f65f472300d 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -767,6 +767,7 @@ def gen_scalars_for_sql(data_gen, count, seed=0, force_no_nulls=False): boolean_gens = [boolean_gen] single_level_array_gens = [ArrayGen(sub_gen) for sub_gen in all_basic_gens + decimal_gens + [null_gen]] +single_level_array_gens_non_decimal = [ArrayGen(sub_gen) for sub_gen in all_basic_gens + [null_gen]] # Be careful to not make these too large of data generation takes for ever # This is only a few nested array gens, because nesting can be very deep diff --git a/integration_tests/src/main/python/struct_test.py b/integration_tests/src/main/python/struct_test.py index 604572c0f6e..897e6bfb024 100644 --- a/integration_tests/src/main/python/struct_test.py +++ b/integration_tests/src/main/python/struct_test.py @@ -14,11 +14,9 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql from data_gen import * -from marks import incompat from pyspark.sql.types import * -import pyspark.sql.functions as f @pytest.mark.parametrize('data_gen', [StructGen([["first", boolean_gen], ["second", byte_gen], ["third", float_gen]]), StructGen([["first", short_gen], ["second", int_gen], ["third", long_gen]]), @@ -31,3 +29,14 @@ def test_struct_get_item(data_gen): 'a.first', 'a.second', 'a.third')) + + 
+@pytest.mark.parametrize('data_gen', [StructGen([["first", boolean_gen], ["second", byte_gen], ["third", float_gen]]), + StructGen([["first", short_gen], ["second", int_gen], ["third", long_gen]]), + StructGen([["first", long_gen], ["second", long_gen], ["third", long_gen]]), + StructGen([["first", string_gen], ["second", ArrayGen(string_gen)], ["third", ArrayGen(string_gen)]])], ids=idfn) +def test_orderby_struct(data_gen): + assert_gpu_and_cpu_are_equal_sql( + lambda spark : unary_op_df(spark, data_gen), + 'struct_table', + 'select struct_table.a, struct_table.a.first as first_val from struct_table order by first_val') \ No newline at end of file diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 3429bf178ec..6fa828e1ccc 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -2276,7 +2276,8 @@ object GpuOverrides { }), exec[ShuffleExchangeExec]( "The backend for most data being exchanged between processes", - ExecChecks(TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL, TypeSig.all), + ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL + TypeSig.ARRAY + + TypeSig.STRUCT).nested(), TypeSig.all), (shuffle, conf, p, r) => new GpuShuffleMeta(shuffle, conf, p, r)), exec[UnionExec]( "The backend for the union operator", @@ -2327,7 +2328,8 @@ object GpuOverrides { (agg, conf, p, r) => new GpuSortAggregateMeta(agg, conf, p, r)), exec[SortExec]( "The backend for the sort operator", - ExecChecks(TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL, TypeSig.all), + ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL + TypeSig.ARRAY + + TypeSig.STRUCT).nested(), TypeSig.all), (sort, conf, p, r) => new GpuSortMeta(sort, conf, p, r)), exec[ExpandExec]( "The backend for the expand operator", From 
efce100271ea963eb46c59b81b3bc05158e03d48 Mon Sep 17 00:00:00 2001 From: Kuhu Shukla Date: Tue, 12 Jan 2021 12:43:02 -0600 Subject: [PATCH 2/3] Add more tests Signed-off-by: Kuhu Shukla --- .../src/main/python/array_test.py | 25 +++++++++++++++++-- integration_tests/src/main/python/data_gen.py | 1 - .../src/main/python/struct_test.py | 10 +++++++- .../nvidia/spark/rapids/GpuOverrides.scala | 2 ++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index 23b7ddf7004..a05450ad12d 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -45,9 +45,30 @@ def test_nested_array_index(data_gen): 'a[3]', 'a[50]')) -@pytest.mark.parametrize('data_gen', single_level_array_gens_non_decimal, ids=idfn) +# Decimals with negative scale +@pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) def test_orderby_array(data_gen): + assert_gpu_and_cpu_are_equal_sql( + lambda spark : unary_op_df(spark, data_gen), + 'array_table', + 'select array_table.a, array_table.a[0] as first_val from array_table order by first_val', + conf=allow_negative_scale_of_decimal_conf) + + +@pytest.mark.parametrize('data_gen', [ArrayGen(ArrayGen(short_gen, max_length=10), max_length=10), + ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10)], ids=idfn) +def test_orderby_array_of_arrays(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark : unary_op_df(spark, data_gen), 'array_table', - 'select array_table.a, array_table.a[0] as first_val from array_table order by first_val') + 'select array_table.a, array_table.a[0][0] as first_val from array_table order by first_val') + + +@pytest.mark.parametrize('data_gen', [ArrayGen(StructGen([['child0', byte_gen], + ['child1', string_gen], + ['child2', float_gen]]))], ids=idfn) +def test_orderby_array_of_structs(data_gen): + assert_gpu_and_cpu_are_equal_sql( + lambda spark : 
unary_op_df(spark, data_gen), + 'array_table', + 'select array_table.a, array_table.a[0].child0 as first_val from array_table order by first_val') \ No newline at end of file diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index eb5279df4f1..56d0cee1c46 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -772,7 +772,6 @@ def gen_scalars_for_sql(data_gen, count, seed=0, force_no_nulls=False): boolean_gens = [boolean_gen] single_level_array_gens = [ArrayGen(sub_gen) for sub_gen in all_basic_gens + decimal_gens + [null_gen]] -single_level_array_gens_non_decimal = [ArrayGen(sub_gen) for sub_gen in all_basic_gens + [null_gen]] # Be careful to not make these too large of data generation takes for ever # This is only a few nested array gens, because nesting can be very deep diff --git a/integration_tests/src/main/python/struct_test.py b/integration_tests/src/main/python/struct_test.py index aa9b6d13494..bde3b4992e4 100644 --- a/integration_tests/src/main/python/struct_test.py +++ b/integration_tests/src/main/python/struct_test.py @@ -47,4 +47,12 @@ def test_orderby_struct(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark : unary_op_df(spark, data_gen), 'struct_table', - 'select struct_table.a, struct_table.a.first as first_val from struct_table order by first_val') \ No newline at end of file + 'select struct_table.a, struct_table.a.first as val from struct_table order by val') + + +@pytest.mark.parametrize('data_gen', [StructGen([["first", string_gen], ["second", ArrayGen(string_gen)], ["third", ArrayGen(string_gen)]])], ids=idfn) +def test_orderby_struct_2(data_gen): + assert_gpu_and_cpu_are_equal_sql( + lambda spark : unary_op_df(spark, data_gen), + 'struct_table', + 'select struct_table.a, struct_table.a.second[0] as val from struct_table order by val') \ No newline at end of file diff --git 
a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index d7098a4100e..0ef05470019 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -2373,6 +2373,8 @@ object GpuOverrides { (agg, conf, p, r) => new GpuSortAggregateMeta(agg, conf, p, r)), exec[SortExec]( "The backend for the sort operator", + // The SortOrder TypeSig will govern what types can actually be used as sorting key data type. + // The types below are allowed as inputs and outputs. ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL + TypeSig.ARRAY + TypeSig.STRUCT).nested(), TypeSig.all), (sort, conf, p, r) => new GpuSortMeta(sort, conf, p, r)), From 34212a3b4c879402d61c735ede61c99bdcfcfe53 Mon Sep 17 00:00:00 2001 From: Kuhu Shukla Date: Tue, 12 Jan 2021 14:16:55 -0600 Subject: [PATCH 3/3] Remove comment Signed-off-by: Kuhu Shukla --- integration_tests/src/main/python/array_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index b906086da55..412f0c56ac4 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -54,7 +54,7 @@ def test_make_array(data_gen): 'array(a, b)', 'array(b, a, null, {}, {})'.format(s1, s2))) -# Decimals with negative scale + @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) def test_orderby_array(data_gen): assert_gpu_and_cpu_are_equal_sql(