-
Notifications
You must be signed in to change notification settings - Fork 915
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
get_json_object() implementation (#7286)
An implementation of get_json_object(). Reference: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-get_json_object The fundamental functionality here is running a JSONPath query on each row in an input column of JSON strings. JSONPath spec: https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html For review purposes, the key entry point is `parse_json_path()`. Each thread of the kernel processes one row via this function. The behavior is recursive in nature, but we maintain our own context stack so that it runs as a loop instead. `parse_json_path` is just the high-level controlling logic; most of the heavy lifting happens in the `json_state` parser class, though that "heavy lifting" is pretty much just traditional string-parsing code. The path to optimization here (I'll open a separate cudf issue for this) is: - Change `parse_json_path` to work on a warp basis, so that each row in the column would be processed by one warp. - Make the `json_state` parser class thread/warp aware (the class would just store its `tid` and operate accordingly). I think this is reasonably straightforward to do, as most of the cuIO decoding kernels behave like this. Authors: - @nvdbaranec - Raza Jafri (@razajafri) Approvers: - Ray Douglass (@raydouglass) - Jason Lowe (@jlowe) - Jake Hemstad (@jrhemstad) - David (@davidwendt) URL: #7286
- Loading branch information
1 parent
be2f0c0
commit b937112
Showing
17 changed files
with
2,117 additions
and
78 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <benchmark/benchmark.h> | ||
#include <benchmarks/common/generate_benchmark_input.hpp> | ||
#include <benchmarks/fixture/benchmark_fixture.hpp> | ||
#include <benchmarks/synchronization/synchronization.hpp> | ||
|
||
#include <cudf_test/base_fixture.hpp> | ||
#include <cudf_test/column_wrapper.hpp> | ||
|
||
#include <cudf/strings/json.hpp> | ||
#include <cudf/strings/strings_column_view.hpp> | ||
|
||
// Empty subclass of cudf::benchmark; it adds no members or overrides of its own.
class JsonPath : public cudf::benchmark {};
|
||
/**
 * @brief Returns a pseudo-random float in the closed range [0, 1].
 *
 * The value is the next `rand()` sample divided by `RAND_MAX`, so the sequence
 * is governed by whatever seed was last passed to `srand()`.
 */
float frand()
{
  auto const sample = static_cast<float>(rand());
  auto const limit  = static_cast<float>(RAND_MAX);
  return sample / limit;
}
|
||
/**
 * @brief Returns `min` plus a random, truncated fraction of the span `max - min`.
 *
 * @param min Lower bound of the range
 * @param max Upper bound of the range
 * @return `min + static_cast<int>(frand() * (max - min))`
 */
int rand_range(int min, int max)
{
  int const span   = max - min;
  int const offset = static_cast<int>(frand() * span);
  return min + offset;
}
|
||
// Sample JSON objects used to assemble benchmark rows. These tables are only ever read,
// so they are declared const.

// Pool of "book" objects; generate_field() picks entries from it at random.
const std::vector<std::string> Books{
  "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the "
  "Century\",\n\"price\": 8.95\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of "
  "Honour\",\n\"price\": 12.99\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby "
  "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the "
  "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"};
// Rough per-entry byte size, used by build_row() to estimate how many books fit in a row.
constexpr int Approx_book_size = 110;

// Pool of "bicycle" objects; generate_field() picks entries from it at random.
const std::vector<std::string> Bicycles{
  "{\"color\": \"red\", \"price\": 9.95}",
  "{\"color\": \"green\", \"price\": 29.95}",
  "{\"color\": \"blue\", \"price\": 399.95}",
  "{\"color\": \"yellow\", \"price\": 99.95}",
  "{\"color\": \"mauve\", \"price\": 199.95}",
};
// Rough per-entry byte size, used by build_row() to estimate how many bicycles fit in a row.
constexpr int Approx_bicycle_size = 33;

// Extra top-level member placed next to the "store" object in every generated row.
const std::string Misc{"\n\"expensive\": 10\n"};
std::string generate_field(std::vector<std::string> const& values, int num_values) | ||
{ | ||
std::string res; | ||
for (int idx = 0; idx < num_values; idx++) { | ||
if (idx > 0) { res += std::string(",\n"); } | ||
int vindex = std::min(static_cast<int>(floor(frand() * values.size())), | ||
static_cast<int>(values.size() - 1)); | ||
res += values[vindex]; | ||
} | ||
return res; | ||
} | ||
|
||
std::string build_row(int desired_bytes) | ||
{ | ||
// always have at least 2 books and 2 bikes | ||
int num_books = 2; | ||
int num_bicycles = 2; | ||
int remaining_bytes = | ||
desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size)); | ||
|
||
// divide up the remainder between books and bikes | ||
float book_pct = frand(); | ||
float bicycle_pct = 1.0f - book_pct; | ||
num_books += (remaining_bytes * book_pct) / Approx_book_size; | ||
num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size; | ||
|
||
std::string books = "\"book\": [\n" + generate_field(Books, num_books) + "]\n"; | ||
std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n"; | ||
|
||
std::string store = "\"store\": {\n"; | ||
if (frand() <= 0.5f) { | ||
store += books + std::string(",\n") + bicycles; | ||
} else { | ||
store += bicycles + std::string(",\n") + books; | ||
} | ||
store += std::string("}\n"); | ||
|
||
std::string row = std::string("{\n"); | ||
if (frand() <= 0.5f) { | ||
row += store + std::string(",\n") + Misc; | ||
} else { | ||
row += Misc + std::string(",\n") + store; | ||
} | ||
row += std::string("}\n"); | ||
return row; | ||
} | ||
|
||
template <class... QueryArg> | ||
static void BM_case(benchmark::State& state, QueryArg&&... query_arg) | ||
{ | ||
srand(5236); | ||
auto iter = thrust::make_transform_iterator( | ||
thrust::make_counting_iterator(0), | ||
[desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); }); | ||
int num_rows = state.range(0); | ||
cudf::test::strings_column_wrapper input(iter, iter + num_rows); | ||
cudf::strings_column_view scv(input); | ||
size_t num_chars = scv.chars().size(); | ||
|
||
std::string json_path(query_arg...); | ||
|
||
for (auto _ : state) { | ||
cuda_event_timer raii(state, true, 0); | ||
auto result = cudf::strings::get_json_object(scv, json_path); | ||
cudaStreamSynchronize(0); | ||
} | ||
|
||
// this isn't strictly 100% accurate. a given query isn't necessarily | ||
// going to visit every single incoming character. but in spirit it does. | ||
state.SetBytesProcessed(state.iterations() * num_chars); | ||
} | ||
|
||
// Registers BM_case under `name` for the JSONPath `query`. Every benchmark runs over the
// cross product of the first list (read by BM_case as the row count) and the second list
// (read by BM_case as the desired bytes per row), with manual timing reported in
// milliseconds.
//
// The macro deliberately has no trailing semicolon: each invocation supplies its own, which
// avoids leaving a stray empty declaration behind after expansion.
#define JSON_BENCHMARK_DEFINE(name, query)                         \
  BENCHMARK_CAPTURE(BM_case, name, query)                          \
    ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \
    ->UseManualTime()                                              \
    ->Unit(benchmark::kMillisecond)

// Queries ordered roughly from the whole document down to nested and indexed lookups.
JSON_BENCHMARK_DEFINE(query0, "$");
JSON_BENCHMARK_DEFINE(query1, "$.store");
JSON_BENCHMARK_DEFINE(query2, "$.store.book");
JSON_BENCHMARK_DEFINE(query3, "$.store.*");
JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']");
JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cudf/strings/strings_column_view.hpp> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
|
||
namespace cudf { | ||
namespace strings { | ||
namespace detail { | ||
|
||
/** | ||
* @copydoc cudf::strings::get_json_object | ||
* | ||
* @param stream CUDA stream used for device memory operations and kernel launches | ||
*/ | ||
std::unique_ptr<cudf::column> get_json_object( | ||
cudf::strings_column_view const& col, | ||
cudf::string_scalar const& json_path, | ||
rmm::cuda_stream_view stream = rmm::cuda_stream_default, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
} // namespace detail | ||
} // namespace strings | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/* | ||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <cudf/strings/strings_column_view.hpp> | ||
|
||
namespace cudf { | ||
namespace strings { | ||
|
||
/** | ||
* @addtogroup strings_json | ||
* @{ | ||
* @file | ||
*/ | ||
|
||
/** | ||
* @brief Apply a JSONPath string to all rows in an input strings column. | ||
* | ||
* Applies a JSONPath string to an incoming strings column where each row in the column | ||
* is a valid json string. The output is returned by row as a strings column. | ||
* | ||
* https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html | ||
* Implements only the operators: $ . [] * | ||
* | ||
* @param col The input strings column. Each row must contain a valid json string | ||
* @param json_path The JSONPath string to be applied to each row | ||
* @param mr Resource for allocating device memory. | ||
* @return New strings column containing the retrieved json object strings | ||
*/ | ||
std::unique_ptr<cudf::column> get_json_object( | ||
cudf::strings_column_view const& col, | ||
cudf::string_scalar const& json_path, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
/** @} */ // end of doxygen group | ||
} // namespace strings | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.