Skip to content

Commit

Permalink
Enable mixed types as string in GpuJsonScan (#9993)
Browse files Browse the repository at this point in the history
* Enable mixed types as string when reading JSON

Signed-off-by: Andy Grove <[email protected]>

* mixed type test passes

* test for mixed primitive arrays

* enable another test

* trigger build

* disable mixed type as string in GpuJsonToStructs

---------

Signed-off-by: Andy Grove <[email protected]>
  • Loading branch information
andygrove authored Jan 24, 2024
1 parent 35f64fc commit 53c0f33
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 4 deletions.
4 changes: 3 additions & 1 deletion integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,9 @@ def test_read_invalid_json(spark_tmp_table_factory, std_input_path, read_func, f
'mixed-primitives.ndjson',
'mixed-primitives-nested.ndjson',
'simple-nested.ndjson',
- pytest.param('mixed-nested.ndjson', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9353'))
+ 'mixed-nested.ndjson',
+ 'mixed-types-in-struct.ndjson',
+ 'mixed-primitive-arrays.ndjson',
])
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
@pytest.mark.parametrize('schema', [_int_schema])
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{ "var0": [1, 2, 3] }
{ "var0": [false, false, true] }
{ "var0": ["one", "two", "three"] }
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{ "var0": [1, 2, 3] }
{ "var0": { "a": 54321} }
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,10 @@ class JsonPartitionReader(
maxBytesPerChunk, execMetrics, HostLineBuffererFactory) {

def buildJsonOptions(parsedOptions: JSONOptions): cudf.JSONOptions = {
- val builder = cudf.JSONOptions.builder().withRecoverWithNull(true)
- builder.build
+ cudf.JSONOptions.builder()
+ .withRecoverWithNull(true)
+ .withMixedTypesAsStrings(true)
+ .build
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,12 @@ case class GpuJsonToStructs(
val end = combinedHost.getEndListOffset(0)
val length = end - start

- val jsonOptions = cudf.JSONOptions.builder().withRecoverWithNull(true).build()
+ val jsonOptions = cudf.JSONOptions.builder()
+ .withRecoverWithNull(true)
+ // tracking issue for enabling mixed type as string
+ // https://github.com/NVIDIA/spark-rapids/issues/10253
+ .withMixedTypesAsStrings(false)
+ .build()
withResource(cudf.Table.readJSON(jsonOptions, data, start, length)) { tableWithMeta =>
val names = tableWithMeta.getColumnNames
(names, tableWithMeta.releaseTable())
Expand Down

0 comments on commit 53c0f33

Please sign in to comment.