
Replace toTitle with capitalize for GpuInitCap #2838

Merged
merged 15 commits into from
Jul 7, 2021
6 changes: 3 additions & 3 deletions docs/configs.md
@@ -197,7 +197,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.If"></a>spark.rapids.sql.expression.If|`if`|IF expression|true|None|
<a name="sql.expression.In"></a>spark.rapids.sql.expression.In|`in`|IN operator|true|None|
<a name="sql.expression.InSet"></a>spark.rapids.sql.expression.InSet| |INSET operator|true|None|
<a name="sql.expression.InitCap"></a>spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132 Spark also only sees the space character as a word delimiter, but this will capitalize any character after a non-alphabetic character. The behavior will be aligned to match Spark in the future per https://github.com/NVIDIA/spark-rapids/issues/2786.|
<a name="sql.expression.InitCap"></a>spark.rapids.sql.expression.InitCap|`initcap`|Returns str with the first letter of each word in uppercase. All other letters are in lowercase|false|This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not currently support these characters for capitalize. This will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132.|
<a name="sql.expression.InputFileBlockLength"></a>spark.rapids.sql.expression.InputFileBlockLength|`input_file_block_length`|Returns the length of the block being read, or -1 if not available|true|None|
<a name="sql.expression.InputFileBlockStart"></a>spark.rapids.sql.expression.InputFileBlockStart|`input_file_block_start`|Returns the start offset of the block being read, or -1 if not available|true|None|
<a name="sql.expression.InputFileName"></a>spark.rapids.sql.expression.InputFileName|`input_file_name`|Returns the name of the file being read, or empty string if not available|true|None|
@@ -221,7 +221,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.Log1p"></a>spark.rapids.sql.expression.Log1p|`log1p`|Natural log 1 + expr|true|None|
<a name="sql.expression.Log2"></a>spark.rapids.sql.expression.Log2|`log2`|Log base 2|true|None|
<a name="sql.expression.Logarithm"></a>spark.rapids.sql.expression.Logarithm|`log`|Log variable base|true|None|
<a name="sql.expression.Lower"></a>spark.rapids.sql.expression.Lower|`lower`, `lcase`|String lowercase operator|false|This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132|
<a name="sql.expression.Lower"></a>spark.rapids.sql.expression.Lower|`lower`, `lcase`|String lowercase operator|false|This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.|
<a name="sql.expression.MakeDecimal"></a>spark.rapids.sql.expression.MakeDecimal| |Create a Decimal from an unscaled long value for some aggregation optimizations|true|None|
<a name="sql.expression.Md5"></a>spark.rapids.sql.expression.Md5|`md5`|MD5 hash operator|true|None|
<a name="sql.expression.Minute"></a>spark.rapids.sql.expression.Minute|`minute`|Returns the minute component of the string/timestamp|true|None|
@@ -283,7 +283,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.UnboundedPreceding$"></a>spark.rapids.sql.expression.UnboundedPreceding$| |Special boundary for a window frame, indicating all rows preceding the current row|true|None|
<a name="sql.expression.UnixTimestamp"></a>spark.rapids.sql.expression.UnixTimestamp|`unix_timestamp`|Returns the UNIX timestamp of current or specified time|true|None|
<a name="sql.expression.UnscaledValue"></a>spark.rapids.sql.expression.UnscaledValue| |Convert a Decimal to an unscaled long value for some aggregation optimizations|true|None|
<a name="sql.expression.Upper"></a>spark.rapids.sql.expression.Upper|`upper`, `ucase`|String uppercase operator|false|This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132|
<a name="sql.expression.Upper"></a>spark.rapids.sql.expression.Upper|`upper`, `ucase`|String uppercase operator|false|This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.|
<a name="sql.expression.WeekDay"></a>spark.rapids.sql.expression.WeekDay|`weekday`|Returns the day of the week (0 = Monday...6=Sunday)|true|None|
<a name="sql.expression.WindowExpression"></a>spark.rapids.sql.expression.WindowExpression| |Calculates a return value for every input row of a table based on a group (or "window") of rows|true|None|
<a name="sql.expression.WindowSpecDefinition"></a>spark.rapids.sql.expression.WindowSpecDefinition| |Specification of a window function, indicating the partitioning-expression, the row ordering, and the width of the window|true|None|
6 changes: 3 additions & 3 deletions docs/supported_ops.md
@@ -8013,7 +8013,7 @@ Accelerator support is described below.
<td rowSpan="4">InitCap</td>
<td rowSpan="4">`initcap`</td>
<td rowSpan="4">Returns str with the first letter of each word in uppercase. All other letters are in lowercase</td>
<td rowSpan="4">This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132 Spark also only sees the space character as a word delimiter, but this will capitalize any character after a non-alphabetic character. The behavior will be aligned to match Spark in the future per https://github.com/NVIDIA/spark-rapids/issues/2786.</td>
<td rowSpan="4">This is not 100% compatible with the Spark version because in some cases Unicode characters change byte width when changing the case. The GPU string conversion does not currently support these characters for capitalize. This will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132.</td>
<td rowSpan="2">project</td>
<td>input</td>
<td> </td>
@@ -10343,7 +10343,7 @@ Accelerator support is described below.
<td rowSpan="4">Lower</td>
<td rowSpan="4">`lower`, `lcase`</td>
<td rowSpan="4">String lowercase operator</td>
<td rowSpan="4">This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132</td>
<td rowSpan="4">This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.</td>
<td rowSpan="2">project</td>
<td>input</td>
<td> </td>
@@ -17519,7 +17519,7 @@ Accelerator support is described below.
<td rowSpan="4">Upper</td>
<td rowSpan="4">`upper`, `ucase`</td>
<td rowSpan="4">String uppercase operator</td>
<td rowSpan="4">This is not 100% compatible with the Spark version because in some cases unicode characters change byte width when changing the case. The GPU string conversion does not support these characters. For a full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132</td>
<td rowSpan="4">This is not 100% compatible with the Spark version because the Unicode version used by cuDF and the JVM may differ, resulting in some corner-case characters not changing case correctly.</td>
<td rowSpan="2">project</td>
<td>input</td>
<td> </td>
12 changes: 5 additions & 7 deletions integration_tests/src/main/python/string_test.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -314,23 +314,21 @@ def test_length():
'CHAR_LENGTH(a)',
'CHARACTER_LENGTH(a)'))

# Once the xfail is fixed this can replace test_initcap_space
@incompat
@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/120')
def test_initcap():
# Because we don't use the same Unicode version we need to limit
# the character set to something more reasonable
# upper and lower should cover the corner cases, this is mostly to
# see if there are issues with spaces
gen = mk_str_gen('([aAbB]{0,5}[ \r\n\t]{1,2}){1,5}')
gen = mk_str_gen('([aAbB1357_@%-]{0,12}[ \r\n\t]{1,2}){1,5}')
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, gen).select(
f.initcap(f.col('a'))))

@incompat
def test_initcap_space():
# we see a lot more space delim
gen = StringGen('([aAbB]{0,5}[ ]{1,2}){1,5}')
@pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/8644')
def test_initcap_width_change():
gen = mk_str_gen('ʼn([aAbB13ʼnȺéʼnŸ]{0,5}){1,5}')
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, gen).select(
f.initcap(f.col('a'))))
@@ -433,9 +433,8 @@ object GpuOverrides {
"when enabling these, there may be extra groups produced for floating point grouping " +
"keys (e.g. -0.0, and 0.0)"
val CASE_MODIFICATION_INCOMPAT =
"in some cases unicode characters change byte width when changing the case. The GPU string " +
"conversion does not support these characters. For a full list of unsupported characters " +
"see https://github.com/rapidsai/cudf/issues/3132"
"the Unicode version used by cuDF and the JVM may differ, resulting in some " +
"corner-case characters not changing case correctly."
val UTC_TIMEZONE_ID = ZoneId.of("UTC").normalized()
// Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
private[this] lazy val regexList: Seq[String] = Seq("\\", "\u0000", "\\x", "\t", "\n", "\r",
@@ -1250,10 +1249,10 @@ object GpuOverrides {
ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING),
(a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) {
override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child)
}).incompat(CASE_MODIFICATION_INCOMPAT + " Spark also only sees the space character as " +
"a word delimiter, but this will capitalize any character after a non-alphabetic " +
"character. The behavior will be aligned to match Spark in the future per " +
"https://github.com/NVIDIA/spark-rapids/issues/2786."),
}).incompat("in some cases Unicode characters change byte width when changing the case. " +
"The GPU string conversion does not currently support these characters for capitalize. " +
"This will be fixed in the future per https://github.com/rapidsai/cudf/issues/8644. For a " +
"full list of unsupported characters see https://github.com/rapidsai/cudf/issues/3132."),
expr[Log](
"Natural log",
ExprChecks.mathUnary,
@@ -481,7 +481,10 @@ case class GpuInitCap(child: Expression) extends GpuUnaryExpression with Implici
override def inputTypes: Seq[DataType] = Seq(StringType)
override def dataType: DataType = StringType
override protected def doColumnar(input: GpuColumnVector): ColumnVector =
input.getBase.toTitle
withResource(Scalar.fromString(" ")) { space =>
// Spark only sees the space character as a word delimiter.
input.getBase.capitalize(space)
}
}
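The space scalar passed to `capitalize` encodes the Spark rule: only the ASCII space starts a new word, whereas the old `toTitle` path started a new word after any non-alphabetic character. A rough pure-Python sketch of the two behaviors (the helper names are hypothetical, not part of the plugin):

```python
def space_delimited_initcap(s: str) -> str:
    # Spark initcap semantics: split on the space character only,
    # uppercase the first character of each word, lowercase the rest.
    return " ".join(w.capitalize() for w in s.split(" "))

def title_case_any_delimiter(s: str) -> str:
    # toTitle-like semantics: any character following a
    # non-alphabetic character starts a new "word".
    out, prev_alpha = [], False
    for ch in s:
        out.append(ch.lower() if prev_alpha else ch.upper())
        prev_alpha = ch.isalpha()
    return "".join(out)

print(space_delimited_initcap("foo_bar baz"))   # Foo_bar Baz
print(title_case_any_delimiter("foo_bar baz"))  # Foo_Bar Baz
```

The underscore case shows the divergence the old docs warned about: Spark leaves the `b` after `_` lowercase, while the title-case variant capitalizes it.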

case class GpuStringReplace(