-
Notifications
You must be signed in to change notification settings - Fork 915
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Depends on #11242 Feature/finite state transducer Benchmark for Finite State Transducer to parse and identify JSON symbols - [x] FST with output, output index, output str - [x] FST without output index - [x] FST without output - [x] FST without output str See elstehle#1 for the files modified only in this PR (i.e., excluding the parent PR this one depends on) Authors: - Karthikeyan (https://github.com/karthikeyann) - Elias Stehle (https://github.com/elstehle) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Elias Stehle (https://github.com/elstehle) URL: #11243
- Loading branch information
1 parent
2d214ea
commit 586da32
Showing
4 changed files
with
312 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,235 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <benchmarks/common/generate_input.hpp> | ||
#include <benchmarks/fixture/rmm_pool_raii.hpp> | ||
#include <nvbench/nvbench.cuh> | ||
|
||
#include <io/fst/lookup_tables.cuh> | ||
#include <io/utilities/hostdevice_vector.hpp> //TODO find better replacement | ||
|
||
#include <tests/io/fst/common.hpp> | ||
|
||
#include <cudf/scalar/scalar_factories.hpp> | ||
#include <cudf/strings/repeat_strings.hpp> | ||
#include <cudf/types.hpp> | ||
|
||
#include <rmm/cuda_stream.hpp> | ||
#include <rmm/cuda_stream_view.hpp> | ||
#include <rmm/device_buffer.hpp> | ||
#include <rmm/device_uvector.hpp> | ||
|
||
#include <thrust/iterator/discard_iterator.h> | ||
|
||
#include <cstdlib> | ||
|
||
namespace cudf { | ||
namespace { | ||
auto make_test_json_data(nvbench::state& state) | ||
{ | ||
auto const string_size{size_type(state.get_int64("string_size"))}; | ||
|
||
// Test input | ||
std::string input = R"( {)" | ||
R"("category": "reference",)" | ||
R"("index:" [4,12,42],)" | ||
R"("author": "Nigel Rees",)" | ||
R"("title": "Sayings of the Century",)" | ||
R"("price": 8.95)" | ||
R"(} )" | ||
R"({)" | ||
R"("category": "reference",)" | ||
R"("index:" [4,{},null,{"a":[]}],)" | ||
R"("author": "Nigel Rees",)" | ||
R"("title": "Sayings of the Century",)" | ||
R"("price": 8.95)" | ||
R"(} {} [] [ ])"; | ||
|
||
auto d_input_scalar = cudf::make_string_scalar(input); | ||
auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_input_scalar); | ||
const size_type repeat_times = string_size / input.size(); | ||
return cudf::strings::repeat_string(d_string_scalar, repeat_times); | ||
} | ||
|
||
// Brings the DFA description used below into scope: pda_sgs, pda_state_tt, pda_out_tt,
// start_state, NUM_SYMBOL_GROUPS and TT_NUM_STATES (presumably declared in
// <tests/io/fst/common.hpp> - confirm).
using namespace cudf::test::io::json; | ||
// Type used to represent the atomic symbol type used within the finite-state machine | ||
using SymbolT = char; | ||
// Type sufficiently large to index symbols within the input and output (may be unsigned) | ||
using SymbolOffsetT = uint32_t; | ||
// Helper class to set up transition table, symbol group lookup table, and translation table | ||
using DfaFstT = cudf::io::fst::detail::Dfa<char, NUM_SYMBOL_GROUPS, TT_NUM_STATES>; | ||
// Element count used for the one-element `output_gpu_size` buffers in the benchmarks below
constexpr std::size_t single_item = 1; | ||
|
||
} // namespace | ||
|
||
/**
 * @brief Benchmarks `DfaFstT::Transduce` on the generated JSON input with every output enabled:
 * the output symbols, the output indexes and the output size are all written to device buffers.
 *
 * @param state nvbench state; its `string_size` axis selects the input size
 */
void BM_FST_JSON(nvbench::state& state)
{
  // TODO: to be replaced by nvbench fixture once it's ready
  cudf::rmm_pool_raii rmm_pool;

  // Prepare cuda stream for data transfers & kernels
  rmm::cuda_stream stream{};
  rmm::cuda_stream_view stream_view(stream);

  auto input_string = make_test_json_data(state);
  auto& d_input     = static_cast<cudf::scalar_type_t<std::string>&>(*input_string);

  // One element per input character
  state.add_element_count(d_input.size());

  // Prepare input & output buffers
  hostdevice_vector<SymbolT> output_gpu(d_input.size(), stream_view);
  hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item, stream_view);
  hostdevice_vector<SymbolOffsetT> out_indexes_gpu(d_input.size(), stream_view);

  // Run algorithm
  DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()};

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  // The launch object is not needed: all work is issued on `stream`, registered above.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // Allocate device-side temporary storage & run algorithm
    parser.Transduce(d_input.data(),
                     static_cast<SymbolOffsetT>(d_input.size()),
                     output_gpu.device_ptr(),
                     out_indexes_gpu.device_ptr(),
                     output_gpu_size.device_ptr(),
                     start_state,
                     stream.value());
  });
}
|
||
/**
 * @brief Benchmarks `DfaFstT::Transduce` on the generated JSON input without the output indexes:
 * the output symbols and the output size are written to device buffers, the indexes go to a
 * discard iterator.
 *
 * @param state nvbench state; its `string_size` axis selects the input size
 */
void BM_FST_JSON_no_outidx(nvbench::state& state)
{
  // TODO: to be replaced by nvbench fixture once it's ready
  cudf::rmm_pool_raii rmm_pool;

  // Prepare cuda stream for data transfers & kernels
  rmm::cuda_stream stream{};
  rmm::cuda_stream_view stream_view(stream);

  auto input_string = make_test_json_data(state);
  auto& d_input     = static_cast<cudf::scalar_type_t<std::string>&>(*input_string);

  // One element per input character
  state.add_element_count(d_input.size());

  // Prepare input & output buffers. No index buffer is allocated: the indexes are discarded, and
  // such a buffer would need one SymbolOffsetT per input character.
  hostdevice_vector<SymbolT> output_gpu(d_input.size(), stream_view);
  hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item, stream_view);

  // Run algorithm
  DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()};

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  // The launch object is not needed: all work is issued on `stream`, registered above.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // Allocate device-side temporary storage & run algorithm
    parser.Transduce(d_input.data(),
                     static_cast<SymbolOffsetT>(d_input.size()),
                     output_gpu.device_ptr(),
                     thrust::make_discard_iterator(),
                     output_gpu_size.device_ptr(),
                     start_state,
                     stream.value());
  });
}
|
||
/**
 * @brief Benchmarks `DfaFstT::Transduce` on the generated JSON input with both the output symbols
 * and the output indexes sent to discard iterators; only the output size is written to a buffer.
 *
 * @param state nvbench state; its `string_size` axis selects the input size
 */
void BM_FST_JSON_no_out(nvbench::state& state)
{
  // TODO: to be replaced by nvbench fixture once it's ready
  cudf::rmm_pool_raii rmm_pool;

  // Prepare cuda stream for data transfers & kernels
  rmm::cuda_stream stream{};
  rmm::cuda_stream_view stream_view(stream);

  auto input_string = make_test_json_data(state);
  auto& d_input     = static_cast<cudf::scalar_type_t<std::string>&>(*input_string);

  // One element per input character
  state.add_element_count(d_input.size());

  // Prepare input & output buffers
  hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item, stream_view);

  // Run algorithm
  DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()};

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  // The launch object is not needed: all work is issued on `stream`, registered above.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // Allocate device-side temporary storage & run algorithm
    parser.Transduce(d_input.data(),
                     static_cast<SymbolOffsetT>(d_input.size()),
                     thrust::make_discard_iterator(),
                     thrust::make_discard_iterator(),
                     output_gpu_size.device_ptr(),
                     start_state,
                     stream.value());
  });
}
|
||
/**
 * @brief Benchmarks `DfaFstT::Transduce` on the generated JSON input without the output symbols:
 * the output indexes and the output size are written to device buffers, the symbols go to a
 * discard iterator.
 *
 * @param state nvbench state; its `string_size` axis selects the input size
 */
void BM_FST_JSON_no_str(nvbench::state& state)
{
  // TODO: to be replaced by nvbench fixture once it's ready
  cudf::rmm_pool_raii rmm_pool;

  // Prepare cuda stream for data transfers & kernels
  rmm::cuda_stream stream{};
  rmm::cuda_stream_view stream_view(stream);

  auto input_string = make_test_json_data(state);
  auto& d_input     = static_cast<cudf::scalar_type_t<std::string>&>(*input_string);

  // One element per input character
  state.add_element_count(d_input.size());

  // Prepare input & output buffers
  hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item, stream_view);
  hostdevice_vector<SymbolOffsetT> out_indexes_gpu(d_input.size(), stream_view);

  // Run algorithm
  DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()};

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  // The launch object is not needed: all work is issued on `stream`, registered above.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // Allocate device-side temporary storage & run algorithm
    parser.Transduce(d_input.data(),
                     static_cast<SymbolOffsetT>(d_input.size()),
                     thrust::make_discard_iterator(),
                     out_indexes_gpu.device_ptr(),
                     output_gpu_size.device_ptr(),
                     start_state,
                     stream.value());
  });
}
|
||
// Benchmark registrations. Every variant sweeps the same "string_size" axis: powers of two with
// exponents taken from nvbench::range(20, 31, 1).
// NOTE(review): if the range end is inclusive (as the nvbench docs suggest - confirm), the last
// point requests 2^31 bytes, which does not fit in a 32-bit signed size; confirm that the top of
// the range is intended.
NVBENCH_BENCH(BM_FST_JSON) | ||
.set_name("FST_JSON") | ||
.add_int64_power_of_two_axis("string_size", nvbench::range(20, 31, 1)); | ||
|
||
NVBENCH_BENCH(BM_FST_JSON_no_outidx) | ||
.set_name("FST_JSON_no_outidx") | ||
.add_int64_power_of_two_axis("string_size", nvbench::range(20, 31, 1)); | ||
|
||
NVBENCH_BENCH(BM_FST_JSON_no_out) | ||
.set_name("FST_JSON_no_out") | ||
.add_int64_power_of_two_axis("string_size", nvbench::range(20, 31, 1)); | ||
|
||
NVBENCH_BENCH(BM_FST_JSON_no_str) | ||
.set_name("FST_JSON_no_str") | ||
.add_int64_power_of_two_axis("string_size", nvbench::range(20, 31, 1)); | ||
|
||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
namespace cudf::test::io::json {
//------------------------------------------------------------------------------
// TEST FST SPECIFICATIONS
//------------------------------------------------------------------------------
// FST to check for brackets and braces outside of pairs of quotes.
//
// The tables below are declared `inline` (C++17) so that every translation unit including this
// header refers to one shared object instead of building its own private copy.

// States of the DFA
enum DFA_STATES : char {
  // The state being active while being outside of a string. When encountering an opening bracket or
  // curly brace, we push it onto the stack. When encountering a closing bracket or brace, we pop it
  // from the stack.
  TT_OOS = 0U,
  // The state being active while being within a string (e.g., field name or a string value). We do
  // not push or pop from the stack while being in this state.
  TT_STR,
  // The state being active after encountering an escape symbol (e.g., '\') while being in the
  // TT_STR state.
  TT_ESC,
  // Total number of states
  TT_NUM_STATES
};

// Definition of the symbol groups
enum PDA_SG_ID {
  OBC = 0U,          ///< Opening brace SG: {
  OBT,               ///< Opening bracket SG: [
  CBC,               ///< Closing brace SG: }
  CBT,               ///< Closing bracket SG: ]
  QTE,               ///< Quote character SG: "
  ESC,               ///< Escape character SG: '\'
  OTR,               ///< SG implicitly matching all other characters
  NUM_SYMBOL_GROUPS  ///< Total number of symbol groups
};

// Transition table: pda_state_tt[state][symbol group] is the state entered after reading a symbol
// of that group while in `state`
inline const std::vector<std::vector<DFA_STATES>> pda_state_tt = {
  /* IN_STATE         {       [       }       ]       "       \    OTHER */
  /* TT_OOS    */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS},
  /* TT_STR    */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR},
  /* TT_ESC    */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}};

// Translation table (i.e., for each transition, what are the symbols that we output)
inline const std::vector<std::vector<std::vector<char>>> pda_out_tt = {
  /* IN_STATE        {      [      }      ]      "      \    OTHER */
  /* TT_OOS    */ {{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}},
  /* TT_STR    */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}},
  /* TT_ESC    */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}};

// The i-th string representing all the characters of a symbol group. There is no entry for OTR,
// which implicitly matches every character not listed here.
inline const std::vector<std::string> pda_sgs = {"{", "[", "}", "]", "\"", "\\"};

// The DFA's starting state
inline constexpr DFA_STATES start_state = TT_OOS;

}  // namespace cudf::test::io::json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters