Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC for whitespace removal in input JSON data using FST #14931

Merged
merged 17 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp)
ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp)
ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json_whitespace_normalization_test.cu)
ConfigureTest(
DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp
GPUS 1
Expand Down
243 changes: 243 additions & 0 deletions cpp/tests/io/json_whitespace_normalization_test.cu
shrshi marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <io/fst/lookup_tables.cuh>
#include <io/utilities/hostdevice_vector.hpp>

#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/types.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/cudf_gtest.hpp>
#include <cudf_test/testing_main.hpp>

#include <rmm/cuda_stream.hpp>
#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/discard_iterator.h>

#include <cstdlib>
#include <string>

namespace {
// Type used to represent the atomic symbol type used within the finite-state machine
using SymbolT = char;
using StateT = char;

// Type sufficiently large to index symbols within the input and output (may be unsigned)
using SymbolOffsetT = uint32_t;

enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_WS, TT_NUM_STATES };
shrshi marked this conversation as resolved.
Show resolved Hide resolved
enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
ESCAPE_CHAR, ///< Escape character SG: '\\'
NEWLINE_CHAR, ///< Newline character SG: '\n'
WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' '
OTHER_SYMBOLS, ///< SG implicitly matching all other characters
NUM_SYMBOL_GROUPS ///< Total number of symbol groups
};

// Aliases for readability of the transition table
constexpr auto TT_OOS = dfa_states::TT_OOS;
constexpr auto TT_DQS = dfa_states::TT_DQS;
constexpr auto TT_DEC = dfa_states::TT_DEC;
constexpr auto TT_WS = dfa_states::TT_WS;
constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);

// The i-th string representing all the characters of a symbol group
std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
vuule marked this conversation as resolved.
Show resolved Hide resolved
{{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
shrshi marked this conversation as resolved.
Show resolved Hide resolved
shrshi marked this conversation as resolved.
Show resolved Hide resolved

// Transition table
std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
{/* IN_STATE " \ \n <SPC> OTHER */
/* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_WS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
/* TT_DEC */ {{TT_DQS, TT_DQS, TT_OOS, TT_DQS, TT_DQS}},
/* TT_WS */ {{TT_DQS, TT_OOS, TT_OOS, TT_WS, TT_OOS}}}};
shrshi marked this conversation as resolved.
Show resolved Hide resolved

// The DFA's starting state
constexpr StateT start_state = static_cast<StateT>(TT_OOS);

struct TransduceToNormalizedWS {
/**
* @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
*/
template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
SymbolGroupT const match_id,
RelativeOffsetT const relative_offset,
SymbolT const read_symbol) const
{
// -------- TRANSLATION TABLE ------------
// Let the alphabet set be Sigma
// ---------------------------------------
// ---------- NON-SPECIAL CASES: ----------
// Output symbol same as input symbol <s>
// state | read_symbol <s> -> output_symbol <s>
// DQS | Sigma -> Sigma
// WS | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
// OOS | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
// ---------- SPECIAL CASES: --------------
// Input symbol translates to output symbol
// OOS | {<SPC>} -> <nop>
// OOS | {\t} -> <nop>
// WS | {<SPC>} -> <nop>
// WS | {\t} -> <nop>

// Case when read symbol is a whitespace or tabspace but is unquoted
shrshi marked this conversation as resolved.
Show resolved Hide resolved
// This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
// However, since there is no output in this case i.e. the count returned by
// operator()(state_id, match_id, read_symbol) is zero, this function is never called.
// So skipping the check for this case.

// In all other cases, we have an output symbol for the input symbol.
// We simply output the input symbol
return read_symbol;
}

/**
* @brief Returns the number of output characters for a given transition.
* During whitespace normalization, we always emit one output character i.e., the input
* character, except when we need to remove the space/tab character
*/
template <typename StateT, typename SymbolGroupT, typename SymbolT>
constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
SymbolGroupT const match_id,
SymbolT const read_symbol) const
{
// Case when read symbol is a whitespace or tabspace but is unquoted
if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS)) &&
((state_id == static_cast<StateT>(dfa_states::TT_OOS)) ||
(state_id == static_cast<StateT>(dfa_states::TT_WS)))) {
shrshi marked this conversation as resolved.
Show resolved Hide resolved
return 0;
}
return 1;
}
};
} // namespace

// Base test fixture for tests
struct JsonWSNormalizationTest : public cudf::test::BaseFixture {};

void run_test(const std::string& input, const std::string& output)
shrshi marked this conversation as resolved.
Show resolved Hide resolved
{
// Prepare cuda stream for data transfers & kernels
rmm::cuda_stream stream{};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be cudf::get_default_stream() for tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great idea! It's better to call cudf::test::get_default_stream() here instead of creating a new stream. Fixed.

rmm::cuda_stream_view stream_view(stream);

auto parser = cudf::io::fst::detail::make_fst(
cudf::io::fst::detail::make_symbol_group_lut(wna_sgs),
cudf::io::fst::detail::make_transition_table(wna_state_tt),
cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedWS{}),
stream);
shrshi marked this conversation as resolved.
Show resolved Hide resolved

auto d_input_scalar = cudf::make_string_scalar(input, stream_view);
auto& d_input = static_cast<cudf::scalar_type_t<std::string>&>(*d_input_scalar);

// Prepare input & output buffers
constexpr std::size_t single_item = 1;
cudf::detail::hostdevice_vector<SymbolT> output_gpu(input.size() * 2, stream_view);
shrshi marked this conversation as resolved.
Show resolved Hide resolved
cudf::detail::hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item, stream_view);

// Allocate device-side temporary storage & run algorithm
parser.Transduce(d_input.data(),
static_cast<SymbolOffsetT>(d_input.size()),
output_gpu.device_ptr(),
thrust::make_discard_iterator(),
output_gpu_size.device_ptr(),
start_state,
stream_view);

// Async copy results from device to host
output_gpu.device_to_host_async(stream_view);
output_gpu_size.device_to_host_async(stream_view);

// Make sure results have been copied back to host
stream.synchronize();

// Verify results
ASSERT_EQ(output_gpu_size[0], output.size());
shrshi marked this conversation as resolved.
Show resolved Hide resolved
CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size());
}

TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces)
{
std::string input = R"({"A" : "TEST" })";
std::string output = R"({"A":"TEST"})";
run_test(input, output);
}

TEST_F(JsonWSNormalizationTest, GroundTruth_MoreSpaces)
{
std::string input = R"({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": {"c": "d"}})";
std::string output = R"({"a":[1,2,3,4,5,6,7,8],"b":{"c":"d"}})";
run_test(input, output);
}

TEST_F(JsonWSNormalizationTest, GroundTruth_StringSpaces)
{
std::string input = R"({" a ":50})";
std::string output = R"({" a ":50})";
run_test(input, output);
}

TEST_F(JsonWSNormalizationTest, GroundTruth_StillMoreSpaces)
{
std::string input = R"( { "a" : 50 })";
std::string output = R"({"a":50})";
run_test(input, output);
}

TEST_F(JsonWSNormalizationTest, GroundTruth_Tabs)
{
std::string input = "{\"a\":\t\"b\"}";
std::string output = R"({"a":"b"})";
run_test(input, output);
}

TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesAndTabs)
{
std::string input = "{\"A\" : \t\"TEST\" }";
std::string output = R"({"A":"TEST"})";
run_test(input, output);
}

TEST_F(JsonWSNormalizationTest, GroundTruth_MultilineJSONWithSpacesAndTabs)
{
std::string input =
"{ \"foo rapids\": [1,2,3], \"bar\trapids\": 123 }\n\t{ \"foo rapids\": { \"a\": 1 }, "
"\"bar\trapids\": 456 }";
std::string output =
"{\"foo rapids\":[1,2,3],\"bar\trapids\":123}\n{\"foo rapids\":{\"a\":1},\"bar\trapids\":456}";
run_test(input, output);
}

TEST_F(JsonWSNormalizationTest, GroundTruth_PureJSONExample)
{
std::string input = R"([{"a":50}, {"a" : 60}])";
std::string output = R"([{"a":50},{"a":60}])";
run_test(input, output);
}

TEST_F(JsonWSNormalizationTest, GroundTruth_NoNormalizationRequired)
{
std::string input = R"({"a":50})";
std::string output = R"({"a":50})";
run_test(input, output);
}

CUDF_TEST_PROGRAM_MAIN()
Loading