Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use parser from spark to normalize json path in GetJsonObject #10466

Merged
17 changes: 15 additions & 2 deletions integration_tests/src/main/python/get_json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,27 +257,40 @@ def test_get_json_object_unquoted_array_notation():
conf={'spark.rapids.sql.expression.GetJsonObject': 'true'})


@pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/10215")
def test_get_json_object_white_space_removal():
# This is a special version of invalid path. It is something that the GPU supports
# but the CPU thinks is invalid
schema = StructType([StructField("jsonStr", StringType())])
data = [['{" a":" A"," b":" B"}'],
['{"a":"A","b":"B"}'],
['{"a ":"A ","b ":"B "}'],
['{" a ":" A "," b ":" B "}']]
['{" a ":" A "," b ":" B "}'],
['{" a ": {" a ":" A "}," b ": " B "}'],
['{" a":"b","a.a":"c","b":{"a":"ab"}}'],
['{" a":"b"," a. a":"c","b":{"a":"ab"}}'],
['{" a":"b","a .a ":"c","b":{"a":"ab"}}'],
['{" a":"b"," a . a ":"c","b":{"a":"ab"}}'],]

assert_gpu_and_cpu_are_equal_collect(
lambda spark: spark.createDataFrame(data,schema=schema).select(
f.col('jsonStr'),
f.get_json_object('jsonStr', '$.a').alias('dot_a'),
f.get_json_object('jsonStr', '$. a').alias('dot_space_a'),
f.get_json_object('jsonStr', '$.\ta').alias('dot_tab_a'),
f.get_json_object('jsonStr', '$. a').alias('dot_spaces_a3'),
f.get_json_object('jsonStr', '$. a').alias('dot_space_a'),
f.get_json_object('jsonStr', '$.a ').alias('dot_a_space'),
f.get_json_object('jsonStr', '$. a ').alias('dot_space_a_space'),
f.get_json_object('jsonStr', "$['b']").alias('dot_b'),
f.get_json_object('jsonStr', "$[' b']").alias('dot_space_b'),
f.get_json_object('jsonStr', "$['b ']").alias('dot_b_space'),
f.get_json_object('jsonStr', "$[' b ']").alias('dot_space_b_space'),
f.get_json_object('jsonStr', "$. a. a").alias('dot_space_a_dot_space_a'),
f.get_json_object('jsonStr', "$.a .a ").alias('dot_a_space_dot_a_space'),
f.get_json_object('jsonStr', "$. a . a ").alias('dot_space_a_space_dot_space_a_space'),
f.get_json_object('jsonStr', "$[' a. a']").alias('space_a_dot_space_a'),
f.get_json_object('jsonStr', "$['a .a ']").alias('a_space_dot_a_space'),
f.get_json_object('jsonStr', "$[' a . a ']").alias('space_a_space_dot_space_a_space'),
),
conf={'spark.rapids.sql.expression.GetJsonObject': 'true'})

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,7 +16,7 @@

package com.nvidia.spark.rapids

import ai.rapids.cudf.{ColumnVector,GetJsonObjectOptions}
import ai.rapids.cudf.{ColumnVector, GetJsonObjectOptions, Scalar}
import com.nvidia.spark.rapids.Arm.withResource

import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression}
Expand All @@ -32,8 +32,20 @@ case class GpuGetJsonObject(json: Expression, path: Expression)
override def nullable: Boolean = true
override def prettyName: String = "get_json_object"

override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = {
lhs.getBase().getJSONObject(rhs.getBase,
def normalizeJsonPath(path: GpuScalar): Scalar = {
val pathStr = if (path.isValid) {
path.getValue.toString()
} else {
throw new IllegalArgumentException("Invalid path")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know how big of a deal this is, but it means that the path is null. We should not throw an exception here, but we should return None as a null path should result in a null for all resulting values instead of an exception.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

}
// Remove all leading whitespaces before names, so we erase all whitespaces after . or ['
val normalizedPath = pathStr.replaceAll("""\.(\s+)""", ".").replaceAll("""\['(\s+)""", "['")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like us using regular expressions here. It might work, but I don't see it as a good long term solution. Could we please take the code that Spark uses to parse a JSON path.

https://github.com/apache/spark/blob/78f7c30e140fd8cf4a80b783dd7e9ee4d1b4d7e2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala#L59

And modify it so that we output a normalized JSON path instead. That would also give us the ability to detect errors in the path before we send it to CUDF and make sure that we are compatible with what CUDF supports.

Meaning we really just reuse their code and then convert the Option[List[PathInstruction]] back to a string.

parsedPath match {
  case None => // ERROR return a column of nulls
  case Some(lp) =>
    val ret = new StringBuffer()
    ret.append('$')
    lp.foreach {
      case Subscript | Key => // Ignored we don't care about these
      case Wildcard => ret.append("[*]")
      case Named(name) => ret.append(s"['${name}']")
      case Index(idx) => ret.append(s"[$idx]")
      case _ => ERROR!!!!
    }
}

Copy link
Collaborator Author

@thirtiseven thirtiseven Feb 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Updated to use JsonPathParser, this approach also looks to fix some other issues related to json path.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the PR description to explain why this PR can fix these issues.

Scalar.fromString(normalizedPath)
}

override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = {
val normalizedScalar = normalizeJsonPath(rhs)
lhs.getBase().getJSONObject(normalizedScalar,
GetJsonObjectOptions.builder().allowSingleQuotes(true).build());
}

Expand Down
Loading