From 57d5c8eff8fea5db57291455a5327d2036666bba Mon Sep 17 00:00:00 2001 From: Firestarman Date: Tue, 29 Jun 2021 14:54:15 +0800 Subject: [PATCH 01/15] Replace toTitle with capitalize for GpuInitCap Signed-off-by: Firestarman --- .../src/main/python/string_test.py | 17 +---------------- .../com/nvidia/spark/rapids/GpuOverrides.scala | 5 +---- .../spark/sql/rapids/stringFunctions.scala | 5 ++++- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index ba007efaeb2..58fcfc2ed86 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -314,27 +314,12 @@ def test_length(): 'CHAR_LENGTH(a)', 'CHARACTER_LENGTH(a)')) -# Once the xfail is fixed this can replace test_initcap_space -@incompat -@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/120') def test_initcap(): - # Because we don't use the same unicode version we need to limit - # the charicter set to something more reasonable - # upper and lower should cover the corner cases, this is mostly to - # see if there are issues with spaces gen = mk_str_gen('([aAbB]{0,5}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) -@incompat -def test_initcap_space(): - # we see a lot more space delim - gen = StringGen('([aAbB]{0,5}[ ]{1,2}){1,5}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).select( - f.initcap(f.col('a')))) - @pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/119') def test_like_null_xfail(): gen = mk_str_gen('.{0,3}a[|b*.$\r\n]{0,2}c.{0,3}')\ diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 8a3a1fbeb2c..21b0a1f3e35 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -1250,10 +1250,7 @@ object GpuOverrides { ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child) - }).incompat(CASE_MODIFICATION_INCOMPAT + " Spark also only sees the space character as " + - "a word deliminator, but this will capitalize any character after a non-alphabetic " + - "character. The behavior will be aligned to match Spark in the future per " + - "https://github.com/NVIDIA/spark-rapids/issues/2786."), + }).incompat(CASE_MODIFICATION_INCOMPAT), expr[Log]( "Natural log", ExprChecks.mathUnary, diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 0a905e0fda1..b3822b4a372 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -481,7 +481,10 @@ case class GpuInitCap(child: Expression) extends GpuUnaryExpression with Implici override def inputTypes: Seq[DataType] = Seq(StringType) override def dataType: DataType = StringType override protected def doColumnar(input: GpuColumnVector): ColumnVector = - input.getBase.toTitle + withResource(Scalar.fromString(" ")) { space => + // Spark only sees the space character as a word deliminator. + input.getBase.capitalize(space) + } } case class GpuStringReplace( From 689569a0bbf82923c8be5d7d1d565962542b350c Mon Sep 17 00:00:00 2001 From: Firestarman Date: Tue, 29 Jun 2021 15:20:03 +0800 Subject: [PATCH 02/15] Restore the comment Signed-off-by: Firestarman --- integration_tests/src/main/python/string_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 58fcfc2ed86..d0606a29631 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -315,6 +315,8 @@ def test_length(): 'CHARACTER_LENGTH(a)')) def test_initcap(): + # Because we don't use the same unicode version we need to limit + # the charicter set to something more reasonable gen = mk_str_gen('([aAbB]{0,5}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( From f8c67edf0c3a2eb0637578e08aef766ad683e5da Mon Sep 17 00:00:00 2001 From: Firestarman Date: Tue, 29 Jun 2021 15:21:42 +0800 Subject: [PATCH 03/15] more comment Signed-off-by: Firestarman --- integration_tests/src/main/python/string_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index d0606a29631..78c3808bf25 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -317,6 +317,8 @@ def test_length(): def test_initcap(): # Because we don't use the same unicode version we need to limit # the charicter set to something more reasonable + # upper and lower should cover the corner cases, this is mostly to + # see if there are issues with spaces gen = mk_str_gen('([aAbB]{0,5}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( From f2a1e2aa63cef130bda94a8f80365943a1187723 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Wed, 30 Jun 2021 14:15:26 +0800 Subject: [PATCH 04/15] Address the comments Signed-off-by: Firestarman --- integration_tests/src/main/python/string_test.py | 2 +- .../scala/com/nvidia/spark/rapids/GpuOverrides.scala | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 78c3808bf25..aa1f3b6ca88 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -319,7 +319,7 @@ def test_initcap(): # the charicter set to something more reasonable # upper and lower should cover the corner cases, this is mostly to # see if there are issues with spaces - gen = mk_str_gen('([aAbB]{0,5}[ \r\n\t]{1,2}){1,5}') + gen = mk_str_gen('([aAbB1357_@%-ȺéʼnŸ]{0,5}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 21b0a1f3e35..f2bd257db5b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -432,10 +432,6 @@ object GpuOverrides { val FLOAT_DIFFERS_GROUP_INCOMPAT = "when enabling these, there may be extra groups produced for floating point grouping " + "keys (e.g. -0.0, and 0.0)" - val CASE_MODIFICATION_INCOMPAT = - "in some cases unicode characters change byte width when changing the case. The GPU string " + - "conversion does not support these characters. For a full list of unsupported characters " + - "see https://github.com/rapidsai/cudf/issues/3132" val UTC_TIMEZONE_ID = ZoneId.of("UTC").normalized() // Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html private[this] lazy val regexList: Seq[String] = Seq("\\", "\u0000", "\\x", "\t", "\n", "\r", @@ -1250,7 +1246,7 @@ object GpuOverrides { ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child) - }).incompat(CASE_MODIFICATION_INCOMPAT), + }), expr[Log]( "Natural log", ExprChecks.mathUnary, @@ -2198,15 +2194,13 @@ object GpuOverrides { ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[Upper](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuUpper(child) - }) - .incompat(CASE_MODIFICATION_INCOMPAT), + }), expr[Lower]( "String lowercase operator", ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[Lower](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuLower(child) - }) - .incompat(CASE_MODIFICATION_INCOMPAT), + }), expr[StringLPad]( "Pad a string on the left", ExprChecks.projectNotLambda(TypeSig.STRING, TypeSig.STRING, From cc1c353d9ccf6fd39e5c7b0c2e375739c10eec9e Mon Sep 17 00:00:00 2001 From: Firestarman Date: Wed, 30 Jun 2021 14:25:23 +0800 Subject: [PATCH 05/15] Longer size for string gen Signed-off-by: Firestarman --- integration_tests/src/main/python/string_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index aa1f3b6ca88..650b9743255 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -319,7 +319,7 @@ def test_initcap(): # the charicter set to something more reasonable # upper and lower should cover the corner cases, this is mostly to # see if there are issues with spaces - gen = mk_str_gen('([aAbB1357_@%-ȺéʼnŸ]{0,5}[ \r\n\t]{1,2}){1,5}') + gen = mk_str_gen('([aAbB1357_@%-ȺéʼnŸ]{0,16}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) From 1e30a96f3e360a56f883877c65521347279550c3 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 1 Jul 2021 17:22:32 +0800 Subject: [PATCH 06/15] Address new comments Signed-off-by: Firestarman --- integration_tests/src/main/python/string_test.py | 3 ++- .../scala/com/nvidia/spark/rapids/GpuOverrides.scala | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 650b9743255..1eb21407b5e 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -314,12 +314,13 @@ def test_length(): 'CHAR_LENGTH(a)', 'CHARACTER_LENGTH(a)')) +@incompat def test_initcap(): # Because we don't use the same unicode version we need to limit # the charicter set to something more reasonable # upper and lower should cover the corner cases, this is mostly to # see if there are issues with spaces - gen = mk_str_gen('([aAbB1357_@%-ȺéʼnŸ]{0,16}[ \r\n\t]{1,2}){1,5}') + gen = mk_str_gen('([aAbB1357_@%-]{0,12}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index f2bd257db5b..6fe8c89ceee 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -432,6 +432,9 @@ object GpuOverrides { val FLOAT_DIFFERS_GROUP_INCOMPAT = "when enabling these, there may be extra groups produced for floating point grouping " + "keys (e.g. -0.0, and 0.0)" + val CASE_MODIFICATION_INCOMPAT = + "it will not be always 100% compatible with CPU because the unicode standards used by" + + " cuDF and JVM may differ." val UTC_TIMEZONE_ID = ZoneId.of("UTC").normalized() // Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html private[this] lazy val regexList: Seq[String] = Seq("\\", "\u0000", "\\x", "\t", "\n", "\r", @@ -1246,7 +1249,7 @@ object GpuOverrides { ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child) - }), + }).incompat(CASE_MODIFICATION_INCOMPAT), expr[Log]( "Natural log", ExprChecks.mathUnary, @@ -2194,13 +2197,15 @@ object GpuOverrides { ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[Upper](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuUpper(child) - }), + }) + .incompat(CASE_MODIFICATION_INCOMPAT), expr[Lower]( "String lowercase operator", ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[Lower](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuLower(child) - }), + }) + .incompat(CASE_MODIFICATION_INCOMPAT), expr[StringLPad]( "Pad a string on the left", ExprChecks.projectNotLambda(TypeSig.STRING, TypeSig.STRING, From dd072675eff0d355cc2d13419036e5b692c6e60b Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 1 Jul 2021 17:24:47 +0800 Subject: [PATCH 07/15] comment update Signed-off-by: Firestarman --- .../src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 6fe8c89ceee..e344d1b1033 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -433,7 +433,7 @@ object GpuOverrides { "when enabling these, there may be extra groups produced for floating point grouping " + "keys (e.g. -0.0, and 0.0)" val CASE_MODIFICATION_INCOMPAT = - "it will not be always 100% compatible with CPU because the unicode standards used by" + + "it will not be always 100% compatible with Spark because the unicode standards used by" + " cuDF and JVM may differ." val UTC_TIMEZONE_ID = ZoneId.of("UTC").normalized() // Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html From 17f88c9de2bb1089232a0c88f7fb312a31155452 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Fri, 2 Jul 2021 09:53:38 +0800 Subject: [PATCH 08/15] Update sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala Improve the incompat doc Co-authored-by: Jason Lowe --- .../src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index e344d1b1033..bdc1574f3da 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -433,8 +433,8 @@ object GpuOverrides { "when enabling these, there may be extra groups produced for floating point grouping " + "keys (e.g. -0.0, and 0.0)" val CASE_MODIFICATION_INCOMPAT = - "it will not be always 100% compatible with Spark because the unicode standards used by" + - " cuDF and JVM may differ." + "the Unicode version used by cuDF and the JVM may differ, resulting in some " + + "corner-case characters not changing case correctly." val UTC_TIMEZONE_ID = ZoneId.of("UTC").normalized() // Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html private[this] lazy val regexList: Seq[String] = Seq("\\", "\u0000", "\\x", "\t", "\n", "\r", From 613284f475cc9ac79f55ac9c7af8d36c26b81d4e Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 2 Jul 2021 09:59:45 +0800 Subject: [PATCH 09/15] More characters Signed-off-by: Firestarman --- integration_tests/src/main/python/string_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 1eb21407b5e..a18c1af3af7 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -320,7 +320,7 @@ def test_initcap(): # the charicter set to something more reasonable # upper and lower should cover the corner cases, this is mostly to # see if there are issues with spaces - gen = mk_str_gen('([aAbB1357_@%-]{0,12}[ \r\n\t]{1,2}){1,5}') + gen = mk_str_gen('([aAbB1357_@%-ȺéʼnŸ]{0,16}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) From 188e4092ee58d2a261404a7974e814387472a6f2 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 2 Jul 2021 12:10:16 +0800 Subject: [PATCH 10/15] Doc update Signed-off-by: Firestarman --- docs/configs.md | 6 +++--- docs/supported_ops.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index b17b4c99439..2d37d1b1853 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -197,7 +197,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.If|`if`|IF expression|true|None| spark.rapids.sql.expression.In|`in`|IN operator|true|None| spark.rapids.sql.expression.InSet| |INSET operator|true|None| -spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132 Spark also only sees the space character as a word deliminator, but this will capitalize any character after a non-alphabetic character. The behavior will be aligned to match Spark in the future per https://github.com/NVIDIA/spark-rapids/issues/2786.| +spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.| spark.rapids.sql.expression.InputFileBlockLength|`input_file_block_length`|Returns the length of the block being read, or -1 if not available|true|None| spark.rapids.sql.expression.InputFileBlockStart|`input_file_block_start`|Returns the start offset of the block being read, or -1 if not available|true|None| spark.rapids.sql.expression.InputFileName|`input_file_name`|Returns the name of the file being read, or empty string if not available|true|None| @@ -221,7 +221,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.Log1p|`log1p`|Natural log 1 + expr|true|None| spark.rapids.sql.expression.Log2|`log2`|Log base 2|true|None| spark.rapids.sql.expression.Logarithm|`log`|Log variable base|true|None| -spark.rapids.sql.expression.Lower|`lower`, `lcase`|String lowercase operator|false|This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132| +spark.rapids.sql.expression.Lower|`lower`, `lcase`|String lowercase operator|false|This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.| spark.rapids.sql.expression.MakeDecimal| |Create a Decimal from an unscaled long value for some aggregation optimizations|true|None| spark.rapids.sql.expression.Md5|`md5`|MD5 hash operator|true|None| spark.rapids.sql.expression.Minute|`minute`|Returns the minute component of the string/timestamp|true|None| @@ -283,7 +283,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.UnboundedPreceding$| |Special boundary for a window frame, indicating all rows preceding the current row|true|None| spark.rapids.sql.expression.UnixTimestamp|`unix_timestamp`|Returns the UNIX timestamp of current or specified time|true|None| spark.rapids.sql.expression.UnscaledValue| |Convert a Decimal to an unscaled long value for some aggregation optimizations|true|None| -spark.rapids.sql.expression.Upper|`upper`, `ucase`|String uppercase operator|false|This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132| +spark.rapids.sql.expression.Upper|`upper`, `ucase`|String uppercase operator|false|This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.| spark.rapids.sql.expression.WeekDay|`weekday`|Returns the day of the week (0 = Monday...6=Sunday)|true|None| spark.rapids.sql.expression.WindowExpression| |Calculates a return value for every input row of a table based on a group (or "window") of rows|true|None| spark.rapids.sql.expression.WindowSpecDefinition| |Specification of a window function, indicating the partitioning-expression, the row ordering, and the width of the window|true|None| diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 001fe0c2e98..0201f6ccf86 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -8013,7 +8013,7 @@ Accelerator support is described below. InitCap `initcap` Returns str with the first letter of each word in uppercase. All other letters are in lowercase -This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132 Spark also only sees the space character as a word deliminator, but this will capitalize any character after a non-alphabetic character. The behavior will be aligned to match Spark in the future per https://github.com/NVIDIA/spark-rapids/issues/2786. +This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly. project input @@ -10343,7 +10343,7 @@ Accelerator support is described below. Lower `lower`, `lcase` String lowercase operator -This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132 +This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly. project input @@ -17519,7 +17519,7 @@ Accelerator support is described below. Upper `upper`, `ucase` String uppercase operator -This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132 +This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly. project input From a33752769be1c24220649a5451a880dbb8ec62cd Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 2 Jul 2021 13:58:52 +0800 Subject: [PATCH 11/15] less characters. Since tests failed due to some special charaters Signed-off-by: Firestarman --- integration_tests/src/main/python/string_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index a18c1af3af7..1eb21407b5e 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -320,7 +320,7 @@ def test_initcap(): # the charicter set to something more reasonable # upper and lower should cover the corner cases, this is mostly to # see if there are issues with spaces - gen = mk_str_gen('([aAbB1357_@%-ȺéʼnŸ]{0,16}[ \r\n\t]{1,2}){1,5}') + gen = mk_str_gen('([aAbB1357_@%-]{0,12}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) From 63d02d4101966f6971bee896222a3e54e5f98678 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 2 Jul 2021 16:27:40 +0800 Subject: [PATCH 12/15] doc update Signed-off-by: Firestarman --- .../src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index bdc1574f3da..eafc4599eef 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -1249,7 +1249,9 @@ object GpuOverrides { ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child) - }).incompat(CASE_MODIFICATION_INCOMPAT), + }).incompat("in some cases Unicode characters change byte width when changing the case." + + " The GPU string conversion does not support these characters for capitalize now. For a full" + + " list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132."), expr[Log]( "Natural log", ExprChecks.mathUnary, From f8916ff15df4c651a35f58df79d1ac42253014c9 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 2 Jul 2021 16:40:28 +0800 Subject: [PATCH 13/15] correct the doc Signed-off-by: Firestarman --- docs/configs.md | 2 +- docs/supported_ops.md | 2 +- integration_tests/src/main/python/string_test.py | 8 ++++++++ .../main/scala/com/nvidia/spark/rapids/GpuOverrides.scala | 4 ++-- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 2d37d1b1853..8874c252aa5 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -197,7 +197,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.If|`if`|IF expression|true|None| spark.rapids.sql.expression.In|`in`|IN operator|true|None| spark.rapids.sql.expression.InSet| |INSET operator|true|None| -spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.| +spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not support these characters for capitalize now. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132.| spark.rapids.sql.expression.InputFileBlockLength|`input_file_block_length`|Returns the length of the block being read, or -1 if not available|true|None| spark.rapids.sql.expression.InputFileBlockStart|`input_file_block_start`|Returns the start offset of the block being read, or -1 if not available|true|None| spark.rapids.sql.expression.InputFileName|`input_file_name`|Returns the name of the file being read, or empty string if not available|true|None| diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 0201f6ccf86..fa14a4f7216 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -8013,7 +8013,7 @@ Accelerator support is described below. InitCap `initcap` Returns str with the first letter of each word in uppercase. All other letters are in lowercase -This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly. +This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not support these characters for capitalize now. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132. project input diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 1eb21407b5e..3824312abe1 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -325,6 +325,14 @@ def test_initcap(): lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) +@incompat +@pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/8644') +def test_initcap_width_change(): + gen = mk_str_gen('ʼn([aAbB13ʼnȺéʼnŸ]{0,5}){1,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).select( + f.initcap(f.col('a')))) + @pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/119') def test_like_null_xfail(): gen = mk_str_gen('.{0,3}a[|b*.$\r\n]{0,2}c.{0,3}')\ diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index eafc4599eef..c2ecfa01c14 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -1250,8 +1250,8 @@ object GpuOverrides { (a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child) }).incompat("in some cases Unicode characters change byte width when changing the case." + - " The GPU string conversion does not support these characters for capitalize now. For a full" + - " list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132."), + " The GPU string conversion does not support these characters for capitalize now. For a" + + " full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132."), expr[Log]( "Natural log", ExprChecks.mathUnary, From 3ac3e815563fa1bbebdbfe0b60447516f2c8cab7 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 2 Jul 2021 16:49:10 +0800 Subject: [PATCH 14/15] Add isssue link in the doc Signed-off-by: Firestarman --- docs/configs.md | 2 +- docs/supported_ops.md | 2 +- .../src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 8874c252aa5..543c429df62 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -197,7 +197,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.If|`if`|IF expression|true|None| spark.rapids.sql.expression.In|`in`|IN operator|true|None| spark.rapids.sql.expression.InSet| |INSET operator|true|None| -spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not support these characters for capitalize now. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132.| +spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not support these characters for capitalize now. This will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132.| spark.rapids.sql.expression.InputFileBlockLength|`input_file_block_length`|Returns the length of the block being read, or -1 if not available|true|None| spark.rapids.sql.expression.InputFileBlockStart|`input_file_block_start`|Returns the start offset of the block being read, or -1 if not available|true|None| spark.rapids.sql.expression.InputFileName|`input_file_name`|Returns the name of the file being read, or empty string if not available|true|None| diff --git a/docs/supported_ops.md b/docs/supported_ops.md index fa14a4f7216..f602dc29135 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -8013,7 +8013,7 @@ Accelerator support is described below. InitCap `initcap` Returns str with the first letter of each word in uppercase. All other letters are in lowercase -This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not support these characters for capitalize now. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132. +This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not support these characters for capitalize now. This will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132. project input diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index c2ecfa01c14..98ea482d9c4 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -1250,7 +1250,8 @@ object GpuOverrides { (a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child) }).incompat("in some cases Unicode characters change byte width when changing the case." + - " The GPU string conversion does not support these characters for capitalize now. For a" + + " The GPU string conversion does not support these characters for capitalize now. This" + + " will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a" + " full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132."), expr[Log]( "Natural log", From 24ef719370ddf8bd33a6dd6d2c3f187b885a8a53 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Mon, 5 Jul 2021 13:20:08 +0800 Subject: [PATCH 15/15] Addressed the new comments. Signed-off-by: Firestarman --- docs/configs.md | 2 +- docs/supported_ops.md | 2 +- integration_tests/src/main/python/string_test.py | 8 ++++---- .../main/scala/com/nvidia/spark/rapids/GpuOverrides.scala | 5 +---- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 543c429df62..2d37d1b1853 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -197,7 +197,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.If|`if`|IF expression|true|None| spark.rapids.sql.expression.In|`in`|IN operator|true|None| spark.rapids.sql.expression.InSet| |INSET operator|true|None| -spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not support these characters for capitalize now. This will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132.| +spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.| spark.rapids.sql.expression.InputFileBlockLength|`input_file_block_length`|Returns the length of the block being read, or -1 if not available|true|None| spark.rapids.sql.expression.InputFileBlockStart|`input_file_block_start`|Returns the start offset of the block being read, or -1 if not available|true|None| spark.rapids.sql.expression.InputFileName|`input_file_name`|Returns the name of the file being read, or empty string if not available|true|None| diff --git a/docs/supported_ops.md b/docs/supported_ops.md index f602dc29135..0201f6ccf86 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -8013,7 +8013,7 @@ Accelerator support is described below. InitCap `initcap` Returns str with the first letter of each word in uppercase. All other letters are in lowercase -This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not support these characters for capitalize now. This will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132. +This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly. project input diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 3824312abe1..7602c48737b 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -320,15 +320,15 @@ def test_initcap(): # the charicter set to something more reasonable # upper and lower should cover the corner cases, this is mostly to # see if there are issues with spaces - gen = mk_str_gen('([aAbB1357_@%-]{0,12}[ \r\n\t]{1,2}){1,5}') + gen = mk_str_gen('([aAbB1357ȺéŸ_@%-]{0,15}[ \r\n\t]{1,2}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) @incompat -@pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/8644') -def test_initcap_width_change(): - gen = mk_str_gen('ʼn([aAbB13ʼnȺéʼnŸ]{0,5}){1,5}') +@pytest.mark.xfail(reason='Spark initcap will not convert ʼn to ʼN') +def test_initcap_special_chars(): + gen = mk_str_gen('ʼn([aAbB13ȺéŸ]{0,5}){1,5}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).select( f.initcap(f.col('a')))) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 98ea482d9c4..bdc1574f3da 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -1249,10 +1249,7 @@ object GpuOverrides { ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING), (a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) { override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child) - }).incompat("in some cases Unicode characters change byte width when changing the case." + - " The GPU string conversion does not support these characters for capitalize now. This" + - " will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a" + - " full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132."), + }).incompat(CASE_MODIFICATION_INCOMPAT), expr[Log]( "Natural log", ExprChecks.mathUnary,