-
Notifications
You must be signed in to change notification settings - Fork 914
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support casting of Map type to string in JSON reader #14936
Changes from all commits
febcfff
11aa95b
a3cbc4f
ecf4e13
56585c5
e083b4a
9e07e0d
56dacca
4cb8673
55ef545
9ce7875
ce16a53
70dbc31
2ffdc8a
0535d21
4a432cd
4cd6150
6add1c7
53cba69
3cc7403
b44f52c
3df4c41
2f0a2e0
3c394e8
9e7abd7
e600fb5
afbdcb8
107259a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -307,6 +307,32 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input, | |
rmm::cuda_stream_view stream, | ||
rmm::mr::device_memory_resource* mr); | ||
|
||
/** | ||
* @brief Get the path data type of a column by path if present in input schema | ||
* | ||
* @param path path of the column | ||
* @param options json reader options which holds schema | ||
* @return data type of the column if present | ||
*/ | ||
std::optional<data_type> get_path_data_type( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we have this as a member of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
host_span<std::pair<std::string, cudf::io::json::NodeT>> path, | ||
cudf::io::json_reader_options const& options); | ||
|
||
/** | ||
* @brief Helper class to get path of a column by column id from reduced column tree | ||
* | ||
*/ | ||
struct path_from_tree { | ||
host_span<NodeT const> column_categories; | ||
host_span<NodeIndexT const> column_parent_ids; | ||
host_span<std::string const> column_names; | ||
bool is_array_of_arrays; | ||
NodeIndexT const row_array_parent_col_id; | ||
|
||
using path_rep = std::pair<std::string, cudf::io::json::NodeT>; | ||
std::vector<path_rep> get_path(NodeIndexT this_col_id); | ||
}; | ||
|
||
/** | ||
* @brief Parses the given JSON string and generates table from the given input. | ||
* | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "nested_json.hpp" | ||
|
||
#include <cudf/detail/utilities/visitor_overload.hpp> | ||
|
||
#include <optional> | ||
#include <string> | ||
#include <vector> | ||
|
||
namespace cudf::io::json::detail { | ||
|
||
std::optional<schema_element> child_schema_element(std::string const& col_name, | ||
cudf::io::json_reader_options const& options) | ||
{ | ||
return std::visit( | ||
cudf::detail::visitor_overload{ | ||
[col_name](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> { | ||
auto column_index = atol(col_name.data()); | ||
return (static_cast<std::size_t>(column_index) < user_dtypes.size()) | ||
? std::optional<schema_element>{{user_dtypes[column_index]}} | ||
: std::optional<schema_element>{}; | ||
}, | ||
[col_name]( | ||
std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> { | ||
return (user_dtypes.find(col_name) != std::end(user_dtypes)) | ||
? std::optional<schema_element>{{user_dtypes.find(col_name)->second}} | ||
: std::optional<schema_element>{}; | ||
}, | ||
[col_name]( | ||
std::map<std::string, schema_element> const& user_dtypes) -> std::optional<schema_element> { | ||
return (user_dtypes.find(col_name) != std::end(user_dtypes)) | ||
? user_dtypes.find(col_name)->second | ||
: std::optional<schema_element>{}; | ||
}}, | ||
options.get_dtypes()); | ||
} | ||
|
||
// Resolves the data type that a schema assigns to the leaf of a column path.
//
// `path` holds (name, node category) entries with the leaf at front() and the root at back();
// `root` is the schema that corresponds to path.back().
//
// Example schemas and their paths (entries listed root first, i.e. back() to front()):
//   "a": int                {"a", int}
//   "a": [ int ]            {"a", list}, {"element", int}
//   "a": { "b": int}        {"a", struct}, {"b", int}
//   "a": [ {"b": int }]     {"a", list}, {"element", struct}, {"b", int}
//   "a": [ null]            {"a", list}, {"element", str}
//
// Returns the type of the schema element reached at the leaf. Returns an empty optional when a
// path step has no matching child in the schema, or when the node category of a step and the
// schema type at that level are not both struct or both list.
std::optional<data_type> get_path_data_type(
  host_span<std::pair<std::string, cudf::io::json::NodeT>> path, schema_element const& root)
{
  schema_element const* schema = &root;

  // `depth` is the number of path entries still to be matched; path[depth - 1] is the entry
  // described by `*schema`, and path[depth - 2] is the child to descend into.
  for (auto depth = path.size(); depth > 1; --depth) {
    auto const category  = path[depth - 1].second;
    auto const schema_id = schema->type.id();
    auto const& children = schema->child_types;

    auto child = std::end(children);
    if (category == NC_STRUCT && schema_id == type_id::STRUCT) {
      // Struct children are looked up by the name of the next path entry.
      child = children.find(path[depth - 2].first);
    } else if (category == NC_LIST && schema_id == type_id::LIST) {
      // List children are always stored under the fixed list child name.
      child = children.find(list_child_name);
    } else {
      return std::nullopt;
    }

    if (child == std::end(children)) { return std::nullopt; }
    schema = &child->second;
  }

  // Zero or one entry left: the current schema element describes the leaf.
  return schema->type;
}
|
||
// Resolves the data type of a column path against the schema stored in the reader options.
// The root of the path (path.back()) selects the top-level schema element; the remaining
// entries are resolved by the schema_element overload. Returns an empty optional for an
// empty path or when the options hold no schema for the root column.
std::optional<data_type> get_path_data_type(
  host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
  cudf::io::json_reader_options const& options)
{
  if (path.empty()) { return std::nullopt; }

  auto const root_schema = child_schema_element(path.back().first, options);
  if (!root_schema.has_value()) { return std::nullopt; }

  return get_path_data_type(path, *root_schema);
}
|
||
// Walks from `this_col_id` towards the root of the column tree, appending one
// (name, node category) entry per visited column, so the result is ordered leaf first.
// The walk ends successfully once the current column id equals `row_array_parent_col_id`.
// An empty path is returned if the loop ends without that happening.
// TODO idea: add a memoizer (template + lambda) so the walk can be reused recursively.
std::vector<path_from_tree::path_rep> path_from_tree::get_path(NodeIndexT this_col_id)
{
  std::vector<path_rep> result;

  while (this_col_id != parent_node_sentinel) {
    auto const category = column_categories[this_col_id];
    auto next_col_id    = column_parent_ids[this_col_id];
    std::string label;

    // TODO move the name/parent resolution below into a separate lambda.
    bool const has_parent = (next_col_id != parent_node_sentinel);
    if (!has_parent || column_categories[next_col_id] == NC_LIST) {
      // Child of a list (or of nothing): it gets the generic list child name, except for
      // the columns directly below the row array in array-of-arrays input.
      bool const is_row_column = is_array_of_arrays && next_col_id == row_array_parent_col_id;
      if (!is_row_column) {
        label = list_child_name;
      } else {
        label = column_names[this_col_id];
      }
    } else if (column_categories[next_col_id] == NC_FN) {
      // Parent is a field-name node: it supplies the name, and the walk continues from
      // the field name's own parent.
      label       = column_names[next_col_id];
      next_col_id = column_parent_ids[next_col_id];
    }

    // "name": type/schema
    result.emplace_back(label, category);

    this_col_id = next_col_id;
    if (this_col_id == row_array_parent_col_id) { return result; }
  }

  return {};
}
|
||
} // namespace cudf::io::json::detail |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2239,4 +2239,56 @@ TEST_F(JsonReaderTest, MixedTypes) | |
expected_list); | ||
} | ||
|
||
// Verifies the column types produced when the schema requests STRING for columns whose JSON
// values are nested (list / struct), with mixed_types_as_string enabled.
TEST_F(JsonReaderTest, MapTypes)
{
  using cudf::type_id;
  // Testing function for mixed types in JSON (for spark json reader).
  // Reads `json_string` (as JSON lines when `lines` is true) and checks that the resulting
  // columns have exactly the type ids listed in `types`, in order.
  auto test_fn = [](std::string_view json_string, bool lines, std::vector<type_id> const& types) {
    std::map<std::string, cudf::io::schema_element> dtype_schema{
      // list won't be a string: the calls below expect LIST for this column.
      // Per the review discussion, coercing lists as well is tracked separately in #15278.
      {"foo1", {data_type{type_id::STRING}}},
      {"foo2", {data_type{type_id::STRING}}},  // struct forced as a string
      {"1", {data_type{type_id::STRING}}},
      {"2", {data_type{type_id::STRING}}},
      {"bar", {dtype<int32_t>()}},
    };

    cudf::io::json_reader_options in_options =
      cudf::io::json_reader_options::builder(
        cudf::io::source_info{json_string.data(), json_string.size()})
        .dtypes(dtype_schema)
        .mixed_types_as_string(true)
        .lines(lines);

    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
    // Fatal on mismatch: continuing with more columns than expected types would index
    // `types` out of bounds in the loop below.
    ASSERT_EQ(static_cast<std::size_t>(result.tbl->num_columns()), types.size());
    std::size_t i = 0;
    for (auto const& col : result.tbl->view()) {
      EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type";
      ++i;
    }
  };

  // json
  test_fn(R"([{ "foo1": [1,2,3], "bar": 123 },
          { "foo2": { "a": 1 }, "bar": 456 }])",
          false,
          {type_id::LIST, type_id::INT32, type_id::STRING});
  // jsonl
  test_fn(R"( { "foo1": [1,2,3], "bar": 123 }
          { "foo2": { "a": 1 }, "bar": 456 })",
          true,
          {type_id::LIST, type_id::INT32, type_id::STRING});
  // jsonl-array
  test_fn(R"([123, [1,2,3]]
          [456, null,  { "a": 1 }])",
          true,
          {type_id::INT64, type_id::LIST, type_id::STRING});
  // json-array
  test_fn(R"([[[1,2,3], null, 123],
          [null, { "a": 1 }, 456 ]])",
          false,
          {type_id::LIST, type_id::STRING, type_id::STRING});
}
|
||
CUDF_TEST_PROGRAM_MAIN() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we need to know the tree path only for mixed types, can we create the object only when the option is enabled?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The object is lightweight: it holds spans and a couple of primitives. So, if the suggestion is aimed at reducing runtime or memory, it may not matter much.
I added the struct because in the future it can have a memoizer.