From c9e54cfe20c030a3772d4179c750b4a3358c9ee1 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 8 Mar 2024 13:47:22 -0500 Subject: [PATCH] Improve performance in JSON reader when `mixed_types_as_string` option is enabled (#15236) Addresses #15196 by applying a patch from @karthikeyann to skip the `infer_column_type_kernel` by forcing the mixed types column to be a string. With this optimization, we see a significant improvement in performance. Please refer to the [comment](https://github.com/rapidsai/cudf/pull/15236#issuecomment-1979772672) for a visualization of the results before and after applying this optimization as obtained from the [JSON lines benchmarking exercise](https://github.com/rapidsai/cudf/pull/15124). Authors: - Shruti Shivakumar (https://github.com/shrshi) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15236 --- cpp/src/io/json/json_column.cu | 3 +++ cpp/src/io/json/nested_json.hpp | 2 ++ 2 files changed, 5 insertions(+) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 10646fad354..6576d41dd72 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -674,6 +674,7 @@ void make_device_json_column(device_span input, reinitialize_as_string(old_col_id, col); // all its children (which are already inserted) are ignored later. } + col.forced_as_string_column = true; columns.try_emplace(this_col_id, columns.at(old_col_id)); continue; } @@ -915,6 +916,8 @@ std::pair, std::vector> device_json_co : "n/a"); #endif target_type = schema.value().type; + } else if (json_col.forced_as_string_column) { + target_type = data_type{type_id::STRING}; } // Infer column type, if we don't have an explicit type for it else { diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index f41b024bb1e..64fffdb27fc 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -160,6 +160,8 @@ struct device_json_column { std::vector column_order; // Counting the current number of items in this column row_offset_t num_rows = 0; + // Force as string column + bool forced_as_string_column{false}; /** * @brief Construct a new d json column object