rapidsai · rapids-bot · Mar 11, 2024 · Jan 30, 2024 · Jan 31, 2024 · Feb 1, 2024
@@ -382,6 +382,7 @@ add_library(
   src/io/json/read_json.cu
   src/io/json/legacy/json_gpu.cu
   src/io/json/legacy/reader_impl.cu
+  src/io/json/parser_features.cpp
   src/io/json/write_json.cu
   src/io/orc/aggregate_orc_metadata.cpp
   src/io/orc/dict_enc.cu

@@ -333,6 +333,7 @@ class json_reader_options {
 
   /**
    * @brief Set whether to parse mixed types as a string column.
+   * Also enables forcing to read a struct as string column using schema.
    *
    * @param val Boolean value to enable/disable parsing mixed types as a string column
    */
@@ -491,6 +492,7 @@ class json_reader_options_builder {
 
   /**
    * @brief Set whether to parse mixed types as a string column.
+   * Also enables forcing to read a struct as string column using schema.
    *
    * @param val Boolean value to enable/disable parsing mixed types as a string column
    * @return this for chaining

@@ -496,15 +496,16 @@ void make_device_json_column(device_span<SymbolT const> input,
     rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin());
 
   NodeIndexT const row_array_parent_col_id = [&]() {
-    if (!is_array_of_arrays) return parent_node_sentinel;
-    auto const list_node_index = is_enabled_lines ? 0 : 1;
-    NodeIndexT value;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(&value,
-                                  col_ids.data() + list_node_index,
-                                  sizeof(NodeIndexT),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-    stream.synchronize();
+    NodeIndexT value = parent_node_sentinel;
+    if (!col_ids.empty()) {
+      auto const list_node_index = is_enabled_lines ? 0 : 1;
+      CUDF_CUDA_TRY(cudaMemcpyAsync(&value,
+                                    col_ids.data() + list_node_index,
+                                    sizeof(NodeIndexT),
+                                    cudaMemcpyDefault,
+                                    stream.value()));
+      stream.synchronize();
+    }
     return value;
   }();
 
@@ -592,6 +593,12 @@ void make_device_json_column(device_span<SymbolT const> input,
     col.column_order.clear();
   };
 
+  path_from_tree tree_path{column_categories,
+                           column_parent_ids,
+                           column_names,
+                           is_array_of_arrays,
+                           row_array_parent_col_id};
+
   // 2. generate nested columns tree and its device_memory
   // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order.
   auto h_range_col_id_it =
@@ -642,6 +649,7 @@ void make_device_json_column(device_span<SymbolT const> input,
       ignore_vals[this_col_id]          = 1;
       continue;
     }
+
     // If the child is already found,
     // replace if this column is a nested column and the existing was a value column
     // ignore this column if this column is a value column and the existing was a nested column
@@ -700,6 +708,17 @@ void make_device_json_column(device_span<SymbolT const> input,
                      "A mix of lists and structs within the same column is not supported");
       }
     }
+    if (is_enabled_mixed_types_as_string) {
+      // get path of this column, check if it is a struct forced as string, and enforce it
+      auto nt                          = tree_path.get_path(this_col_id);
+      std::optional<data_type> user_dt = get_path_data_type(nt, options);
+      if (column_categories[this_col_id] == NC_STRUCT and user_dt.has_value() and
+          user_dt.value().id() == type_id::STRING) {
+        is_mixed_type_column[this_col_id] = 1;
+        column_categories[this_col_id]    = NC_STR;
+      }
+    }
+
     CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
     // move into parent
     device_json_column col(stream, mr);

@@ -307,6 +307,32 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
                                              rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Get the path data type of a column by path if present in input schema
+ *
+ * @param path path of the column
+ * @param options json reader options which holds schema
+ * @return data type of the column if present
+ */
+std::optional<data_type> get_path_data_type(
+  host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
+  cudf::io::json_reader_options const& options);
+
+/**
+ * @brief Helper class to get path of a column by column id from reduced column tree
+ *
+ */
+struct path_from_tree {
+  host_span<NodeT const> column_categories;
+  host_span<NodeIndexT const> column_parent_ids;
+  host_span<std::string const> column_names;
+  bool is_array_of_arrays;
+  NodeIndexT const row_array_parent_col_id;
+
+  using path_rep = std::pair<std::string, cudf::io::json::NodeT>;
+  std::vector<path_rep> get_path(NodeIndexT this_col_id);
+};
+
 /**
  * @brief Parses the given JSON string and generates table from the given input.
  *

@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nested_json.hpp"
+
+#include <cudf/detail/utilities/visitor_overload.hpp>
+
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace cudf::io::json::detail {
+
+std::optional<schema_element> child_schema_element(std::string const& col_name,
+                                                   cudf::io::json_reader_options const& options)
+{
+  return std::visit(
+    cudf::detail::visitor_overload{
+      [col_name](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> {
+        auto column_index = atol(col_name.data());
+        return (static_cast<std::size_t>(column_index) < user_dtypes.size())
+                 ? std::optional<schema_element>{{user_dtypes[column_index]}}
+                 : std::optional<schema_element>{};
+      },
+      [col_name](
+        std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> {
+        return (user_dtypes.find(col_name) != std::end(user_dtypes))
+                 ? std::optional<schema_element>{{user_dtypes.find(col_name)->second}}
+                 : std::optional<schema_element>{};
+      },
+      [col_name](
+        std::map<std::string, schema_element> const& user_dtypes) -> std::optional<schema_element> {
+        return (user_dtypes.find(col_name) != std::end(user_dtypes))
+                 ? user_dtypes.find(col_name)->second
+                 : std::optional<schema_element>{};
+      }},
+    options.get_dtypes());
+}
+
+// example schema and its path.
+// "a": int             {"a", int}
+// "a": [ int ]         {"a", list}, {"element", int}
+// "a": { "b": int}     {"a", struct}, {"b", int}
+// "a": [ {"b": int }]  {"a", list}, {"element", struct}, {"b", int}
+// "a": [ null]         {"a", list}, {"element", str}
+// back() is root.
+// front() is leaf.
+std::optional<data_type> get_path_data_type(
+  host_span<std::pair<std::string, cudf::io::json::NodeT>> path, schema_element const& root)
+{
+  if (path.empty() || path.size() == 1) {
+    return root.type;
+  } else {
+    if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) {
+      auto const child_name      = path.first(path.size() - 1).back().first;
+      auto const child_schema_it = root.child_types.find(child_name);
+      return (child_schema_it != std::end(root.child_types))
+               ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second)
+               : std::optional<data_type>{};
+    } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) {
+      auto const child_schema_it = root.child_types.find(list_child_name);
+      return (child_schema_it != std::end(root.child_types))
+               ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second)
+               : std::optional<data_type>{};
+    }
+    return std::optional<data_type>{};
+  }
+}
+
+std::optional<data_type> get_path_data_type(
+  host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
+  cudf::io::json_reader_options const& options)
+{
+  if (path.empty()) return {};
+  std::optional<schema_element> col_schema = child_schema_element(path.back().first, options);
+  // check if it has value, then do recursive call and return.
+  if (col_schema.has_value()) {
+    return get_path_data_type(path, col_schema.value());
+  } else {
+    return {};
+  }
+}
+
+// idea: write a memoizer using template and lambda?, then call recursively.
+std::vector<path_from_tree::path_rep> path_from_tree::get_path(NodeIndexT this_col_id)
+{
+  std::vector<path_rep> path;
+  // TODO Need to stop at row root. so, how to find row root?
+  while (this_col_id != parent_node_sentinel) {
+    auto type        = column_categories[this_col_id];
+    std::string name = "";
+    // TODO make this ifelse into a separate lambda function, along with parent_col_id.
+    auto parent_col_id = column_parent_ids[this_col_id];
+    if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
+      if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) {
+        name = column_names[this_col_id];
+      } else {
+        name = list_child_name;
+      }
+    } else if (column_categories[parent_col_id] == NC_FN) {
+      auto field_name_col_id = parent_col_id;
+      parent_col_id          = column_parent_ids[parent_col_id];
+      name                   = column_names[field_name_col_id];
+    }
+    // "name": type/schema
+    path.emplace_back(name, type);
+    this_col_id = parent_col_id;
+    if (this_col_id == row_array_parent_col_id) return path;
+  }
+  return {};
+}
+
+}  // namespace cudf::io::json::detail
@@ -2239,4 +2239,56 @@ TEST_F(JsonReaderTest, MixedTypes)
           expected_list);
 }
 
+TEST_F(JsonReaderTest, MapTypes)
+{
+  using cudf::type_id;
+  // Testing function for mixed types in JSON (for spark json reader)
+  auto test_fn = [](std::string_view json_string, bool lines, std::vector<type_id> types) {
+    std::map<std::string, cudf::io::schema_element> dtype_schema{
+      {"foo1", {data_type{type_id::STRING}}},  // list won't be a string
+      {"foo2", {data_type{type_id::STRING}}},  // struct forced as a string
+      {"1", {data_type{type_id::STRING}}},
+      {"2", {data_type{type_id::STRING}}},
+      {"bar", {dtype<int32_t>()}},
+    };
+
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_string.data(), json_string.size()})
+        .dtypes(dtype_schema)
+        .mixed_types_as_string(true)
+        .lines(lines);
+
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+    EXPECT_EQ(result.tbl->num_columns(), types.size());
+    int i = 0;
+    for (auto& col : result.tbl->view()) {
+      EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type";
+      i++;
+    }
+    std::cout << "\n";
+  };
+
+  // json
+  test_fn(R"([{ "foo1": [1,2,3], "bar": 123 },
+              { "foo2": { "a": 1 }, "bar": 456 }])",
+          false,
+          {type_id::LIST, type_id::INT32, type_id::STRING});
+  // jsonl
+  test_fn(R"( { "foo1": [1,2,3], "bar": 123 }
+              { "foo2": { "a": 1 }, "bar": 456 })",
+          true,
+          {type_id::LIST, type_id::INT32, type_id::STRING});
+  // jsonl-array
+  test_fn(R"([123, [1,2,3]]
+              [456, null,  { "a": 1 }])",
+          true,
+          {type_id::INT64, type_id::LIST, type_id::STRING});
+  // json-array
+  test_fn(R"([[[1,2,3], null, 123],
+              [null, { "a": 1 }, 456 ]])",
+          false,
+          {type_id::LIST, type_id::STRING, type_id::STRING});
+}
+
 CUDF_TEST_PROGRAM_MAIN()