From ad6fde9354d23108e2a9b8d2c1a77eece878ae17 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Mon, 29 Jan 2024 07:53:23 -0700
Subject: [PATCH] Support reading JSON data with single quotes around attribute names and values (#10273)

* Update some tests to use single quotes

* enable single quote normalization

* signoff

Signed-off-by: Andy Grove

* trigger build

* copyright years

---------

Signed-off-by: Andy Grove
---
 integration_tests/src/main/python/json_test.py | 2 +-
 integration_tests/src/test/resources/dates.json | 10 +++++-----
 .../spark/sql/catalyst/json/rapids/GpuJsonScan.scala | 1 +
 .../org/apache/spark/sql/rapids/GpuJsonToStructs.scala | 1 +
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py
index 54605455353..78537eb5fb3 100644
--- a/integration_tests/src/main/python/json_test.py
+++ b/integration_tests/src/main/python/json_test.py
@@ -599,7 +599,7 @@ def test_from_json_map_fallback():
 @allow_non_gpu(*non_utc_allow)
 def test_from_json_struct(schema):
     # note that column 'a' does not use leading zeroes due to https://github.com/NVIDIA/spark-rapids/issues/9588
-    json_string_gen = StringGen(r'{"a": [1-9]{0,5}, "b": "[A-Z]{0,5}", "c": 1\d\d\d}') \
+    json_string_gen = StringGen(r'{\'a\': [1-9]{0,5}, "b": \'[A-Z]{0,5}\', "c": 1\d\d\d}') \
         .with_special_pattern('', weight=50) \
         .with_special_pattern('null', weight=50)
     assert_gpu_and_cpu_are_equal_collect(
diff --git a/integration_tests/src/test/resources/dates.json b/integration_tests/src/test/resources/dates.json
index 1fdfc3b4320..e32ff381dd4 100644
--- a/integration_tests/src/test/resources/dates.json
+++ b/integration_tests/src/test/resources/dates.json
@@ -1,5 +1,5 @@
-{ "number": "2020-09-16" }
-{ "number": " 2020-09-16" }
-{ "number": "2020-09-16 " }
-{ "number": "1581-01-01" }
-{ "number": "1583-01-01" }
\ No newline at end of file
+{ 'number': '2020-09-16' }
+{ 'number': ' 2020-09-16' }
+{ 'number': '2020-09-16 ' }
+{ 'number': '1581-01-01' }
+{ 'number': '1583-01-01' }
\ No newline at end of file
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala
index 138f99b0c72..c32838d903d 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala
@@ -359,6 +359,7 @@ class JsonPartitionReader(
     cudf.JSONOptions.builder()
       .withRecoverWithNull(true)
       .withMixedTypesAsStrings(enableMixedTypesAsString)
+      .withNormalizeSingleQuotes(true)
       .build
   }
 
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala
index 7d85628636e..4436f6b9716 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala
@@ -183,6 +183,7 @@ case class GpuJsonToStructs(
     val jsonOptions = cudf.JSONOptions.builder()
       .withRecoverWithNull(true)
       .withMixedTypesAsStrings(enableMixedTypesAsString)
+      .withNormalizeSingleQuotes(true)
       .build()
     withResource(cudf.Table.readJSON(jsonOptions, data, start, length)) { tableWithMeta =>
       val names = tableWithMeta.getColumnNames