-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added streams to JSON reader and writer api #14313
Changes from 13 commits
13e5be3
c324948
e33752e
68f80a2
3b8e982
d3344c5
3b8b031
80c5a66
1147d6b
1d87bfc
12b3180
bcc226c
f89d83b
b219b13
13022da
d145e97
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -16,6 +16,7 @@ | |||||||||
|
||||||||||
#include <cudf_test/base_fixture.hpp> | ||||||||||
#include <cudf_test/column_wrapper.hpp> | ||||||||||
#include <cudf_test/default_stream.hpp> | ||||||||||
#include <cudf_test/iterator_utilities.hpp> | ||||||||||
|
||||||||||
#include <cudf/detail/iterator.cuh> | ||||||||||
|
@@ -49,22 +50,25 @@ TEST_F(JsonWriterTest, EmptyInput) | |||||||||
.build(); | ||||||||||
|
||||||||||
// Empty columns in table | ||||||||||
cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected = R"([])"; | ||||||||||
EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); | ||||||||||
|
||||||||||
// Empty columns in table - JSON Lines | ||||||||||
out_buffer.clear(); | ||||||||||
out_options.enable_lines(true); | ||||||||||
cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected_lines = "\n"; | ||||||||||
EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); | ||||||||||
|
||||||||||
// Empty table - JSON Lines | ||||||||||
cudf::table_view tbl_view2{}; | ||||||||||
out_options.set_table(tbl_view2); | ||||||||||
out_buffer.clear(); | ||||||||||
cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); | ||||||||||
EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); | ||||||||||
} | ||||||||||
|
||||||||||
|
@@ -89,17 +93,22 @@ TEST_F(JsonWriterTest, ErrorCases) | |||||||||
.build(); | ||||||||||
|
||||||||||
// not enough column names | ||||||||||
EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), | ||||||||||
cudf::logic_error); | ||||||||||
EXPECT_THROW( | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), | ||||||||||
cudf::logic_error); | ||||||||||
|
||||||||||
mt.schema_info.emplace_back("int16"); | ||||||||||
out_options.set_metadata(mt); | ||||||||||
EXPECT_NO_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource())); | ||||||||||
EXPECT_NO_THROW(cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource())); | ||||||||||
|
||||||||||
// chunk_rows must be at least 8 | ||||||||||
out_options.set_rows_per_chunk(0); | ||||||||||
EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), | ||||||||||
cudf::logic_error); | ||||||||||
EXPECT_THROW( | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), | ||||||||||
cudf::logic_error); | ||||||||||
} | ||||||||||
|
||||||||||
TEST_F(JsonWriterTest, PlainTable) | ||||||||||
|
@@ -121,7 +130,9 @@ TEST_F(JsonWriterTest, PlainTable) | |||||||||
.lines(false) | ||||||||||
.na_rep("null"); | ||||||||||
|
||||||||||
cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json(options_builder.build(), | ||||||||||
cudf::test::get_default_stream(), | ||||||||||
rmm::mr::get_current_device_resource()); | ||||||||||
|
||||||||||
std::string const expected = | ||||||||||
R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; | ||||||||||
|
@@ -151,7 +162,9 @@ TEST_F(JsonWriterTest, SimpleNested) | |||||||||
.lines(true) | ||||||||||
.na_rep("null"); | ||||||||||
|
||||||||||
cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json(options_builder.build(), | ||||||||||
cudf::test::get_default_stream(), | ||||||||||
rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} | ||||||||||
{"a":6,"b":7,"c":{"d":8},"f":10.5} | ||||||||||
{"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} | ||||||||||
|
@@ -183,7 +196,9 @@ TEST_F(JsonWriterTest, MixedNested) | |||||||||
.lines(false) | ||||||||||
.na_rep("null"); | ||||||||||
|
||||||||||
cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json(options_builder.build(), | ||||||||||
cudf::test::get_default_stream(), | ||||||||||
rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected = | ||||||||||
R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" | ||||||||||
R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" | ||||||||||
|
@@ -216,7 +231,8 @@ TEST_F(JsonWriterTest, WriteReadNested) | |||||||||
.na_rep("null") | ||||||||||
.build(); | ||||||||||
|
||||||||||
cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} | ||||||||||
{"a":6,"b":7,"c":{"d":8},"f":10.5} | ||||||||||
{"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} | ||||||||||
|
@@ -291,7 +307,8 @@ TEST_F(JsonWriterTest, WriteReadNested) | |||||||||
mt.schema_info[2].children.clear(); | ||||||||||
out_options.set_metadata(mt); | ||||||||||
out_buffer.clear(); | ||||||||||
cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); | ||||||||||
|
||||||||||
in_options = cudf::io::json_reader_options::builder( | ||||||||||
cudf::io::source_info{out_buffer.data(), out_buffer.size()}) | ||||||||||
|
@@ -314,7 +331,8 @@ TEST_F(JsonWriterTest, WriteReadNested) | |||||||||
// without column names | ||||||||||
out_options.set_metadata(cudf::io::table_metadata{}); | ||||||||||
out_buffer.clear(); | ||||||||||
cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); | ||||||||||
in_options = cudf::io::json_reader_options::builder( | ||||||||||
cudf::io::source_info{out_buffer.data(), out_buffer.size()}) | ||||||||||
.lines(true) | ||||||||||
|
@@ -352,7 +370,8 @@ TEST_F(JsonWriterTest, SpecialChars) | |||||||||
.na_rep("null") | ||||||||||
.build(); | ||||||||||
|
||||||||||
cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json( | ||||||||||
out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} | ||||||||||
{"\"a\"":6,"'b'":"b\b\f\n\r\t"} | ||||||||||
{"\"a\"":1,"'b'":"\"c\""} | ||||||||||
|
@@ -385,7 +404,9 @@ TEST_F(JsonWriterTest, NullList) | |||||||||
.lines(true) | ||||||||||
.na_rep("null"); | ||||||||||
|
||||||||||
cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json(options_builder.build(), | ||||||||||
cudf::test::get_default_stream(), | ||||||||||
rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} | ||||||||||
{"a":[2,null,null,3],"b":null} | ||||||||||
{"a":[null,null,4],"b":[[2,null],null]} | ||||||||||
|
@@ -424,7 +445,9 @@ TEST_F(JsonWriterTest, ChunkedNested) | |||||||||
.na_rep("null") | ||||||||||
.rows_per_chunk(8); | ||||||||||
|
||||||||||
cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json(options_builder.build(), | ||||||||||
cudf::test::get_default_stream(), | ||||||||||
rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected = | ||||||||||
R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} | ||||||||||
{"a":2,"b":-2,"c":{}} | ||||||||||
|
@@ -480,7 +503,9 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) | |||||||||
.lines(true) | ||||||||||
.na_rep("null"); | ||||||||||
|
||||||||||
cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json(options_builder.build(), | ||||||||||
cudf::test::get_default_stream(), | ||||||||||
rmm::mr::get_current_device_resource()); | ||||||||||
std::string const expected = R"({} | ||||||||||
{"e":1} | ||||||||||
{"d":1} | ||||||||||
|
@@ -542,7 +567,9 @@ TEST_F(JsonWriterTest, Unicode) | |||||||||
.lines(true) | ||||||||||
.na_rep("null"); | ||||||||||
|
||||||||||
cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); | ||||||||||
cudf::io::write_json(options_builder.build(), | ||||||||||
cudf::test::get_default_stream(), | ||||||||||
rmm::mr::get_current_device_resource()); | ||||||||||
Comment on lines
+570
to
+572
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can these just be
Suggested change
We have the same values for the defaults. Not sure why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. now I see the difference, cudf::get_default_stream vs cudf::test::get_default_stream There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the stream parameter can be left as is for now. For the moment, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the feedback! I've cleaned up the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I understand what "as is" is in @vyasr 's comment, but it looks like we might be adding the stream parameter to all API calls in the future so we might as do it now. FWIW, the |
||||||||||
|
||||||||||
std::string const expected = | ||||||||||
R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} | ||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
* Copyright (c) 2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <cudf/io/detail/json.hpp> | ||
#include <cudf/io/json.hpp> | ||
#include <cudf/table/table.hpp> | ||
#include <cudf/table/table_view.hpp> | ||
#include <cudf/types.hpp> | ||
|
||
#include <cudf_test/base_fixture.hpp> | ||
#include <cudf_test/column_wrapper.hpp> | ||
#include <cudf_test/default_stream.hpp> | ||
#include <cudf_test/iterator_utilities.hpp> | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
class FunctionsTest : public cudf::test::BaseFixture {}; | ||
|
||
//.dtypes(std::vector<cudf::data_type>{dtype<int32_t>(), dtype<double>()}) | ||
TEST_F(FunctionsTest, JSONreader) | ||
{ | ||
std::string data = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n"; | ||
cudf::io::json_reader_options in_options = | ||
cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) | ||
.dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::INT32}, | ||
cudf::data_type{cudf::type_id::FLOAT64}}) | ||
.lines(true) | ||
.legacy(true); | ||
cudf::io::table_with_metadata result = | ||
cudf::io::read_json(in_options, cudf::test::get_default_stream()); | ||
} | ||
|
||
TEST_F(FunctionsTest, JSONwriter) | ||
{ | ||
cudf::test::strings_column_wrapper col1{"a", "b", "c"}; | ||
cudf::test::strings_column_wrapper col2{"d", "e", "f"}; | ||
cudf::test::fixed_width_column_wrapper<int> col3{1, 2, 3}; | ||
cudf::test::fixed_width_column_wrapper<float> col4{1.5, 2.5, 3.5}; | ||
cudf::test::fixed_width_column_wrapper<int16_t> col5{{1, 2, 3}, | ||
cudf::test::iterators::nulls_at({0, 2})}; | ||
cudf::table_view tbl_view{{col1, col2, col3, col4, col5}}; | ||
cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}}; | ||
/* | ||
cudf::table_view tbl_view{{col1}}; | ||
cudf::io::table_metadata mt{{{"col1"}}}; | ||
*/ | ||
|
||
std::vector<char> out_buffer; | ||
auto destination = cudf::io::sink_info(&out_buffer); | ||
auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) | ||
.include_nulls(true) | ||
.metadata(mt) | ||
.lines(false) | ||
.na_rep("null"); | ||
|
||
cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
column_to_strings_fn
is not meant to be copied.I just added the patch to fix the copy constructor from being invoked. (please apply pre-commit hooks for clang-format, after applying this patch)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
similarly other constructors and assignment operators can be deleted too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you, @karthikeyann. The patch has been applied.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TIL you can get the type dispatcher to take the functor by reference.
Thanks @karthikeyann!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very nice job on this, @karthikeyann. 👏