Skip to content

Commit

Permalink
get_json_object() implementation (rapidsai#7286)
Browse files Browse the repository at this point in the history
An implementation of get_json_object(). 

Reference:  https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-get_json_object

The fundamental functionality here is running a JSONPath query on each row of an input column of JSON strings.

JSONPath spec:  https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html

For review purposes, the key entry point is `parse_json_path()`.  Each thread of the kernel processes one row via this function.  The behavior is recursive in nature, but we maintain our own context stack so that it can be done iteratively in a loop.

`parse_json_path` is just the high-level controlling logic; most of the heavy lifting happens in the `json_state` parser class, although that "heavy lifting" is pretty much just traditional string-parsing code.

The path to optimization here (I'll open a separate cudf issue for this) is
- Change `parse_json_path` to work on a warp basis.  So each row in the column would be processed by one warp.
- Make the `json_state` parser class thread/warp aware (the class would just store its `tid` and operate accordingly).  I think this is reasonably straightforward to do as most of the cuIO decoding kernels behave like this.

Authors:
  - @nvdbaranec
  - Raza Jafri (@razajafri)

Approvers:
  - Ray Douglass (@raydouglass)
  - Jason Lowe (@jlowe)
  - Jake Hemstad (@jrhemstad)
  - David (@davidwendt)

URL: rapidsai#7286
  • Loading branch information
nvdbaranec authored Mar 31, 2021
1 parent be2f0c0 commit b937112
Show file tree
Hide file tree
Showing 17 changed files with 2,117 additions and 78 deletions.
2 changes: 2 additions & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,14 @@ test:
- test -f $PREFIX/include/cudf/strings/detail/converters.hpp
- test -f $PREFIX/include/cudf/strings/detail/copying.hpp
- test -f $PREFIX/include/cudf/strings/detail/fill.hpp
- test -f $PREFIX/include/cudf/strings/detail/json.hpp
- test -f $PREFIX/include/cudf/strings/detail/replace.hpp
- test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
- test -f $PREFIX/include/cudf/strings/extract.hpp
- test -f $PREFIX/include/cudf/strings/findall.hpp
- test -f $PREFIX/include/cudf/strings/find.hpp
- test -f $PREFIX/include/cudf/strings/find_multiple.hpp
- test -f $PREFIX/include/cudf/strings/json.hpp
- test -f $PREFIX/include/cudf/strings/padding.hpp
- test -f $PREFIX/include/cudf/strings/replace.hpp
- test -f $PREFIX/include/cudf/strings/replace_re.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ add_library(cudf
src/strings/find.cu
src/strings/find_multiple.cu
src/strings/padding.cu
src/strings/json/json_path.cu
src/strings/regex/regcomp.cpp
src/strings/regex/regexec.cu
src/strings/replace/backref_re.cu
Expand Down
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,8 @@ ConfigureBench(STRINGS_BENCH
string/substring_benchmark.cpp
string/translate_benchmark.cpp
string/url_decode_benchmark.cpp)

###################################################################################################
# - json benchmark -------------------------------------------------------------------
ConfigureBench(JSON_BENCH
string/json_benchmark.cpp)
140 changes: 140 additions & 0 deletions cpp/benchmarks/string/json_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/json.hpp>
#include <cudf/strings/strings_column_view.hpp>

// Empty subclass of cudf::benchmark; it declares no members of its own and is not
// referenced by the benchmark registrations further down in this file.
class JsonPath : public cudf::benchmark {
};

/// Takes the next rand() draw and scales it by RAND_MAX, yielding a float in [0, 1].
float frand()
{
  auto const sample = static_cast<float>(rand());
  auto const scale  = static_cast<float>(RAND_MAX);
  return sample / scale;
}

/// Uses one frand() draw to pick an integer offset within (max - min) and adds it to `min`.
int rand_range(int min, int max)
{
  int const width   = max - min;
  auto const offset = static_cast<int>(frand() * width);
  return min + offset;
}

// Pool of JSON "book" objects; generate_field() picks entries from it at random to fill
// the "book" array of each generated row.
std::vector<std::string> Books{
"{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the "
"Century\",\n\"price\": 8.95\n}",
"{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of "
"Honour\",\n\"price\": 12.99\n}",
"{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby "
"Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}",
"{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the "
"Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"};
// Rough per-book byte estimate that build_row() uses to turn a byte budget into a book count.
constexpr int Approx_book_size = 110;
// Pool of JSON "bicycle" objects; generate_field() picks entries from it at random to fill
// the "bicycle" array of each generated row.
std::vector<std::string> Bicycles{
"{\"color\": \"red\", \"price\": 9.95}",
"{\"color\": \"green\", \"price\": 29.95}",
"{\"color\": \"blue\", \"price\": 399.95}",
"{\"color\": \"yellow\", \"price\": 99.95}",
"{\"color\": \"mauve\", \"price\": 199.95}",
};
// Rough per-bicycle byte estimate that build_row() uses to turn a byte budget into a bicycle count.
constexpr int Approx_bicycle_size = 33;
// Extra top-level member that build_row() places either before or after the "store" object.
std::string Misc{"\n\"expensive\": 10\n"};
/**
 * @brief Builds a ",\n"-separated list of randomly chosen entries.
 *
 * Each emitted entry is selected from `values` with a single frand() draw, so the
 * output depends on the current rand() state.
 *
 * @param values Candidate strings to choose from
 * @param num_values Number of entries to emit
 * @return The concatenated entries; empty when `values` is empty or `num_values` <= 0
 */
std::string generate_field(std::vector<std::string> const& values, int num_values)
{
  // Nothing to choose from: indexing an empty vector below would be undefined behavior.
  if (values.empty()) { return {}; }

  int const last_index = static_cast<int>(values.size()) - 1;

  std::string res;
  for (int idx = 0; idx < num_values; idx++) {
    if (idx > 0) { res += ",\n"; }
    // The product is never negative, so truncating to int is the same as taking the floor.
    // frand() can return exactly 1.0, hence the clamp to the last valid index.
    int const vindex = std::min(static_cast<int>(frand() * values.size()), last_index);
    res += values[vindex];
  }
  return res;
}

/**
 * @brief Generates one JSON document of roughly `desired_bytes` bytes.
 *
 * The document is an object holding a "store" object (with "book" and "bicycle" arrays)
 * and the `Misc` member. The split of the byte budget between books and bicycles, the
 * order of the two arrays and the order of "store" versus `Misc` are all chosen with
 * frand(), so the output depends on the current rand() state.
 *
 * @param desired_bytes Approximate target size of the row; budgets smaller than the
 *        two-book/two-bicycle baseline still produce that baseline
 * @return The generated JSON text
 */
std::string build_row(int desired_bytes)
{
  // always have at least 2 books and 2 bikes
  int num_books    = 2;
  int num_bicycles = 2;
  int const baseline_bytes =
    (num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size);
  // A budget below the baseline leaves nothing to distribute. Without this clamp the
  // negative remainder would pull the counts below the minimum of 2 set above.
  int const remaining_bytes = desired_bytes > baseline_bytes ? desired_bytes - baseline_bytes : 0;

  // divide up the remainder between books and bikes
  float const book_pct    = frand();
  float const bicycle_pct = 1.0f - book_pct;
  num_books += (remaining_bytes * book_pct) / Approx_book_size;
  num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size;

  // books are generated before bicycles so the sequence of rand() draws stays fixed
  std::string const books    = "\"book\": [\n" + generate_field(Books, num_books) + "]\n";
  std::string const bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n";

  // randomly order the two arrays inside "store"
  std::string store = "\"store\": {\n";
  if (frand() <= 0.5f) {
    store += books + ",\n" + bicycles;
  } else {
    store += bicycles + ",\n" + books;
  }
  store += "}\n";

  // randomly order "store" and the misc member inside the top-level object
  std::string row = "{\n";
  if (frand() <= 0.5f) {
    row += store + ",\n" + Misc;
  } else {
    row += Misc + ",\n" + store;
  }
  row += "}\n";
  return row;
}

/**
 * @brief Benchmark body: runs get_json_object() with one JSONPath query over generated rows.
 *
 * state.range(0) is the number of rows and state.range(1) the desired size of each row
 * in bytes; the rows themselves come from build_row().
 *
 * @param state Benchmark state supplying the arguments and the iteration loop
 * @param query_arg Arguments used to construct the JSONPath query string
 */
template <class... QueryArg>
static void BM_case(benchmark::State& state, QueryArg&&... query_arg)
{
  // constant seed, so the rand()-driven row generation starts from the same state every time
  srand(5236);

  auto make_row   = [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); };
  auto rows_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), make_row);
  int num_rows    = state.range(0);

  cudf::test::strings_column_wrapper input(rows_begin, rows_begin + num_rows);
  cudf::strings_column_view scv(input);
  size_t num_chars = scv.chars().size();

  std::string json_path(query_arg...);

  for (auto _ : state) {
    // timer object lives for exactly one iteration of the benchmark loop
    cuda_event_timer raii(state, true, 0);
    auto result = cudf::strings::get_json_object(scv, json_path);
    cudaStreamSynchronize(0);
  }

  // Reported throughput counts every input character once per iteration. That is an
  // approximation, since a query does not necessarily visit every incoming character.
  state.SetBytesProcessed(state.iterations() * num_chars);
}

// Registers BM_case under `name` with the JSONPath string `query`. The two ArgsProduct
// lists are the values BM_case reads as state.range(0) (number of rows) and
// state.range(1) (desired bytes per row). Timing is manual and reported in milliseconds.
#define JSON_BENCHMARK_DEFINE(name, query) \
BENCHMARK_CAPTURE(BM_case, name, query) \
->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// JSONPath queries to benchmark: the root document, named members, wildcards,
// bracketed names and an array index.
JSON_BENCHMARK_DEFINE(query0, "$");
JSON_BENCHMARK_DEFINE(query1, "$.store");
JSON_BENCHMARK_DEFINE(query2, "$.store.book");
JSON_BENCHMARK_DEFINE(query3, "$.store.*");
JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']");
JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
40 changes: 40 additions & 0 deletions cpp/include/cudf/strings/detail/json.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace strings {
namespace detail {

/**
* @copydoc cudf::strings::get_json_object
*
* This overload takes the same arguments as the public API plus the CUDA stream to use.
*
* @param stream CUDA stream used for device memory operations and kernel launches
*/
std::unique_ptr<cudf::column> get_json_object(
cudf::strings_column_view const& col,
cudf::string_scalar const& json_path,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
} // namespace strings
} // namespace cudf
50 changes: 50 additions & 0 deletions cpp/include/cudf/strings/json.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/strings/strings_column_view.hpp>

namespace cudf {
namespace strings {

/**
* @addtogroup strings_json
* @{
* @file
*/

/**
* @brief Apply a JSONPath string to all rows in an input strings column.
*
* Applies a JSONPath string to an incoming strings column where each row in the column
* is a valid JSON string. The output is returned by row as a strings column.
*
* JSONPath specification:
* https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html
*
* Only the following JSONPath operators are implemented: `$`, `.`, `[]` and `*`.
*
* @param col The input strings column. Each row must contain a valid JSON string
* @param json_path The JSONPath string to be applied to each row
* @param mr Resource for allocating device memory.
* @return New strings column containing the retrieved JSON object strings
*/
std::unique_ptr<cudf::column> get_json_object(
cudf::strings_column_view const& col,
cudf::string_scalar const& json_path,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/include/doxygen_groups.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@
* @defgroup strings_modify Modifying
* @defgroup strings_replace Replacing
* @defgroup strings_split Splitting
* @defgroup strings_json JSON
* @}
* @defgroup dictionary_apis Dictionary
* @{
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
} else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) ||
serialized_trie_contains(opts.trie_false, {field_start, field_len})) {
atomicAdd(&d_columnData[actual_col].bool_count, 1);
} else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) {
} else if (cudf::io::is_infinity(field_start, next_delimiter)) {
atomicAdd(&d_columnData[actual_col].float_count, 1);
} else {
long countNumber = 0;
Expand Down Expand Up @@ -277,15 +277,15 @@ __inline__ __device__ T decode_value(char const *begin,
char const *end,
parse_options_view const &opts)
{
return cudf::io::gpu::parse_numeric<T, base>(begin, end, opts);
return cudf::io::parse_numeric<T, base>(begin, end, opts);
}

template <typename T>
__inline__ __device__ T decode_value(char const *begin,
char const *end,
parse_options_view const &opts)
{
return cudf::io::gpu::parse_numeric<T>(begin, end, opts);
return cudf::io::parse_numeric<T>(begin, end, opts);
}

template <>
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/json/json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ __inline__ __device__ T decode_value(const char *begin,
uint64_t end,
parse_options_view const &opts)
{
return cudf::io::gpu::parse_numeric<T, base>(begin, end, opts);
return cudf::io::parse_numeric<T, base>(begin, end, opts);
}

/**
Expand All @@ -131,7 +131,7 @@ __inline__ __device__ T decode_value(const char *begin,
const char *end,
parse_options_view const &opts)
{
return cudf::io::gpu::parse_numeric<T>(begin, end, opts);
return cudf::io::parse_numeric<T>(begin, end, opts);
}

/**
Expand Down
Loading

0 comments on commit b937112

Please sign in to comment.