Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace toTitle with capitalize for GpuInitCap #2838

Merged
merged 15 commits into from
Jul 7, 2021
15 changes: 2 additions & 13 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -314,23 +314,12 @@ def test_length():
'CHAR_LENGTH(a)',
'CHARACTER_LENGTH(a)'))

# Once the xfail is fixed this can replace test_initcap_space
@incompat
@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/120')
def test_initcap():
jlowe marked this conversation as resolved.
Show resolved Hide resolved
# Because we don't use the same unicode version we need to limit
# the character set to something more reasonable
# upper and lower should cover the corner cases, this is mostly to
# see if there are issues with spaces
gen = mk_str_gen('([aAbB]{0,5}[ \r\n\t]{1,2}){1,5}')
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, gen).select(
f.initcap(f.col('a'))))

@incompat
def test_initcap_space():
# we see a lot more space delim
gen = StringGen('([aAbB]{0,5}[ ]{1,2}){1,5}')
gen = mk_str_gen('([aAbB1357_@%-ȺéʼnŸ]{0,16}[ \r\n\t]{1,2}){1,5}')
jlowe marked this conversation as resolved.
Show resolved Hide resolved
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, gen).select(
f.initcap(f.col('a'))))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -432,10 +432,6 @@ object GpuOverrides {
val FLOAT_DIFFERS_GROUP_INCOMPAT =
"when enabling these, there may be extra groups produced for floating point grouping " +
"keys (e.g. -0.0, and 0.0)"
val CASE_MODIFICATION_INCOMPAT =
"in some cases unicode characters change byte width when changing the case. The GPU string " +
"conversion does not support these characters. For a full list of unsupported characters " +
"see https://github.com/rapidsai/cudf/issues/3132"
val UTC_TIMEZONE_ID = ZoneId.of("UTC").normalized()
// Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
private[this] lazy val regexList: Seq[String] = Seq("\\", "\u0000", "\\x", "\t", "\n", "\r",
Expand Down Expand Up @@ -1250,10 +1246,7 @@ object GpuOverrides {
ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING),
(a, conf, p, r) => new UnaryExprMeta[InitCap](a, conf, p, r) {
override def convertToGpu(child: Expression): GpuExpression = GpuInitCap(child)
}).incompat(CASE_MODIFICATION_INCOMPAT + " Spark also only sees the space character as " +
"a word deliminator, but this will capitalize any character after a non-alphabetic " +
"character. The behavior will be aligned to match Spark in the future per " +
"https://github.com/NVIDIA/spark-rapids/issues/2786."),
}),
expr[Log](
"Natural log",
ExprChecks.mathUnary,
Expand Down Expand Up @@ -2201,15 +2194,13 @@ object GpuOverrides {
ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING),
(a, conf, p, r) => new UnaryExprMeta[Upper](a, conf, p, r) {
override def convertToGpu(child: Expression): GpuExpression = GpuUpper(child)
})
.incompat(CASE_MODIFICATION_INCOMPAT),
jlowe marked this conversation as resolved.
Show resolved Hide resolved
}),
expr[Lower](
"String lowercase operator",
ExprChecks.unaryProjectNotLambdaInputMatchesOutput(TypeSig.STRING, TypeSig.STRING),
(a, conf, p, r) => new UnaryExprMeta[Lower](a, conf, p, r) {
override def convertToGpu(child: Expression): GpuExpression = GpuLower(child)
})
.incompat(CASE_MODIFICATION_INCOMPAT),
}),
expr[StringLPad](
"Pad a string on the left",
ExprChecks.projectNotLambda(TypeSig.STRING, TypeSig.STRING,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,10 @@ case class GpuInitCap(child: Expression) extends GpuUnaryExpression with Implici
override def inputTypes: Seq[DataType] = Seq(StringType)
override def dataType: DataType = StringType
override protected def doColumnar(input: GpuColumnVector): ColumnVector =
input.getBase.toTitle
withResource(Scalar.fromString(" ")) { space =>
// Spark only sees the space character as a word delimiter.
input.getBase.capitalize(space)
}
}

case class GpuStringReplace(
Expand Down