From 0557d4176fca272ac98d644bbb3dd8fa87333d7a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 11 Apr 2022 12:17:55 -0700 Subject: [PATCH 01/38] squashed with bracket/brace test --- cpp/src/io/fst/agent_dfa.cuh | 722 +++++++++++++++++++++++++++ cpp/src/io/fst/device_dfa.cuh | 264 ++++++++++ cpp/src/io/fst/dispatch_dfa.cuh | 462 +++++++++++++++++ cpp/src/io/fst/in_reg_array.cuh | 138 +++++ cpp/src/io/fst/symbol_lut.cuh | 182 +++++++ cpp/src/io/fst/transition_table.cuh | 149 ++++++ cpp/src/io/fst/translation_table.cuh | 200 ++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/fst/fst_test.cu | 291 +++++++++++ 9 files changed, 2409 insertions(+) create mode 100644 cpp/src/io/fst/agent_dfa.cuh create mode 100644 cpp/src/io/fst/device_dfa.cuh create mode 100644 cpp/src/io/fst/dispatch_dfa.cuh create mode 100644 cpp/src/io/fst/in_reg_array.cuh create mode 100644 cpp/src/io/fst/symbol_lut.cuh create mode 100644 cpp/src/io/fst/transition_table.cuh create mode 100644 cpp/src/io/fst/translation_table.cuh create mode 100644 cpp/tests/io/fst/fst_test.cu diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh new file mode 100644 index 00000000000..d983f9287a9 --- /dev/null +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -0,0 +1,722 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "in_reg_array.cuh" + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +//----------------------------------------------------------------------------- +// STATE VECTOR +//----------------------------------------------------------------------------- +/** + * @brief A vector is able to hold multiple state indices (e.g., to represent multiple DFA + * instances, where the i-th item would represent the i-th DFA instance). + * + * @tparam StateIndexT Signed or unsigned type used to index items inside the vector + * @tparam NUM_ITEMS The number of items to be allocated for a vector + */ +template +class MultiItemStateVector { + public: + template + constexpr CUDF_HOST_DEVICE void Set(IndexT index, StateIndexT value) noexcept + { + state_[index] = value; + } + + template + constexpr CUDF_HOST_DEVICE StateIndexT Get(IndexT index) const noexcept + { + return state_[index]; + } + + private: + StateIndexT state_[NUM_ITEMS]; +}; + +//----------------------------------------------------------------------------- +// DFA-SIMULATION STATE COMPOSITION FUNCTORS +//----------------------------------------------------------------------------- +/** + * @brief Implements an associative composition operation for state transition vectors and + * offset-to-overap vectors to be used with a prefix scan. + * + * l r = c ( s->l->r) + * 0: [2] [1] [2] (i.e. 0->2->2) + * 1: [1] [2] [2] (i.e. 1->1->2) + * 2: [0] [2] [1] (i.e. 
2->0->2) + * @tparam NUM_ITEMS The number of items stored within a vector + */ +template +struct VectorCompositeOp { + template + constexpr CUDF_HOST_DEVICE VectorT operator()(VectorT const& lhs, VectorT const& rhs) + { + VectorT res; + for (int32_t i = 0; i < NUM_ITEMS; ++i) { + res.Set(i, rhs.Get(lhs.Get(i))); + } + return res; + } +}; + +//----------------------------------------------------------------------------- +// DFA-SIMULATION CALLBACK WRAPPERS/HELPERS +//----------------------------------------------------------------------------- +template +class DFASimulationCallbackWrapper { + public: + __host__ __device__ __forceinline__ DFASimulationCallbackWrapper( + TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it) + : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false) + { + } + + template + __host__ __device__ __forceinline__ void Init(OffsetT const& offset) + { + this->offset = offset; + if (!write) out_count = 0; + } + + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + StateVectorT const& old_state, + StateVectorT const& new_state, + SymbolIndexT const& symbol_id) + { + uint32_t count = transducer_table(old_state.Get(0), symbol_id); + if (write) { + for (uint32_t out_char = 0; out_char < count; out_char++) { + out_it[out_count + out_char] = + transducer_table(old_state.Get(0), symbol_id, out_char); + out_idx_it[out_count + out_char] = offset + character_index; + } + } + out_count += count; + } + + __host__ __device__ __forceinline__ void TearDown() {} + + public: + TransducerTableT transducer_table; + TransducedOutItT out_it; + TransducedIndexOutItT out_idx_it; + uint32_t out_count; + uint32_t offset; + bool write; +}; + +//----------------------------------------------------------------------------- +// STATE-TRANSITION CALLBACKS +//----------------------------------------------------------------------------- +class 
StateTransitionCallbackOp { + public: + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id) const + { + } +}; +/// Type alias for a state transition callback class that performs no operation on any callback +using NoOpStateTransitionOp = StateTransitionCallbackOp; + +template +class StateVectorTransitionOp : public StateTransitionCallbackOp { + public: + __host__ __device__ __forceinline__ + StateVectorTransitionOp(TransitionTableT const& transition_table, StateVectorT& state_vector) + : transition_table(transition_table), state_vector(state_vector) + { + } + + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const read_symbol_id) const + { + using TransitionVectorT = typename TransitionTableT::TransitionVectorT; + + for (int32_t i = 0; i < NUM_INSTANCES; ++i) { + state_vector.Set(i, transition_table(state_vector.Get(i), read_symbol_id)); + } + } + + public: + StateVectorT& state_vector; + const TransitionTableT& transition_table; +}; + +template +struct StateTransitionOp { + StateVectorT old_state_vector; + StateVectorT state_vector; + const TransitionTableT& transition_table; + CallbackOpT& callback_op; + + __host__ __device__ __forceinline__ StateTransitionOp(const TransitionTableT& transition_table, + StateVectorT state_vector, + CallbackOpT& callback_op) + : transition_table(transition_table), + state_vector(state_vector), + old_state_vector(state_vector), + callback_op(callback_op) + { + } + + template + __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, + const SymbolIndexT& read_symbol_id) + { + using TransitionVectorT= typename TransitionTableT::TransitionVectorT ; + old_state_vector = state_vector; + state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); + callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); + } +}; + 
+template +struct AgentDFA { + using SymbolIndexT = uint32_t; + using StateIndexT = uint32_t; + using AliasedLoadT = uint32_t; + using CharT = typename std::iterator_traits::value_type; + + //------------------------------------------------------------------------------ + // DERIVED CONFIGS + //------------------------------------------------------------------------------ + static constexpr uint32_t BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS; + static constexpr uint32_t ITEMS_PER_THREAD = AgentDFAPolicy::ITEMS_PER_THREAD; + + // The number of symbols per thread + static constexpr uint32_t SYMBOLS_PER_THREAD = ITEMS_PER_THREAD; + static constexpr uint32_t SYMBOLS_PER_BLOCK = BLOCK_THREADS * SYMBOLS_PER_THREAD; + + static constexpr uint32_t MIN_UINTS_PER_BLOCK = + CUB_QUOTIENT_CEILING(SYMBOLS_PER_BLOCK, sizeof(AliasedLoadT)); + static constexpr uint32_t UINTS_PER_THREAD = + CUB_QUOTIENT_CEILING(MIN_UINTS_PER_BLOCK, BLOCK_THREADS); + static constexpr uint32_t UINTS_PER_BLOCK = UINTS_PER_THREAD * BLOCK_THREADS; + static constexpr uint32_t SYMBOLS_PER_UINT_BLOCK = UINTS_PER_BLOCK * sizeof(AliasedLoadT); + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + struct _TempStorage { + // For aliased loading of characters into shared memory + union { + CharT chars[SYMBOLS_PER_BLOCK]; + AliasedLoadT uints[UINTS_PER_BLOCK]; + }; + }; + + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __device__ __forceinline__ AgentDFA(TempStorage& 
temp_storage) + : temp_storage(temp_storage.Alias()) + { + } + + //--------------------------------------------------------------------- + // STATIC PARSING PRIMITIVES + //--------------------------------------------------------------------- + template + __device__ __forceinline__ static void ThreadParse(const SymbolMatcherT& symbol_matcher, + const CharT* chars, + const SymbolIndexT& max_num_chars, + CallbackOpT callback_op, + cub::Int2Type /*IS_FULL_BLOCK*/) + { + uint32_t matched_id; + + // Iterate over symbols +#pragma unroll + for (int32_t i = 0; i < NUM_SYMBOLS; ++i) { + if (IS_FULL_BLOCK || threadIdx.x * SYMBOLS_PER_THREAD + i < max_num_chars) { + matched_id = symbol_matcher(chars[i]); + callback_op.ReadSymbol(i, matched_id); + } + } + } + + template + __device__ __forceinline__ void GetThreadStateTransitions( + const SymbolMatcherT& symbol_matcher, + const CharT* chars, + const SymbolIndexT& max_num_chars, + StateTransitionOpT& state_transition_op, + cub::Int2Type /*IS_FULL_BLOCK*/) + { + ThreadParse( + symbol_matcher, chars, max_num_chars, state_transition_op, cub::Int2Type()); + } + + //--------------------------------------------------------------------- + // LOADING FULL BLOCK OF CHARACTERS, NON-ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type<1> /*ALIGNMENT*/) + { + CharT thread_chars[SYMBOLS_PER_THREAD]; + + const CharT* d_block_symbols = d_chars + block_offset; + cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_chars); + +#pragma unroll + for (int32_t i = 0; i < SYMBOLS_PER_THREAD; ++i) { + temp_storage.chars[threadIdx.x + i * BLOCK_THREADS] = thread_chars[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING PARTIAL BLOCK OF CHARACTERS, NON-ALIASED + 
//--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type<1> /*ALIGNMENT*/) + { + CharT thread_chars[SYMBOLS_PER_THREAD]; + + if (num_total_symbols <= block_offset) return; + + // Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) + OffsetT num_total_chars = num_total_symbols - block_offset; + + const CharT* d_block_symbols = d_chars + block_offset; + cub::LoadDirectStriped( + threadIdx.x, d_block_symbols, thread_chars, num_total_chars); + +#pragma unroll + for (int32_t i = 0; i < SYMBOLS_PER_THREAD; ++i) { + temp_storage.chars[threadIdx.x + i * BLOCK_THREADS] = thread_chars[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING FULL BLOCK OF CHARACTERS, ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type /*ALIGNMENT*/) + { + AliasedLoadT thread_units[UINTS_PER_THREAD]; + + const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_units); + +#pragma unroll + for (int32_t i = 0; i < UINTS_PER_THREAD; ++i) { + temp_storage.uints[threadIdx.x + i * BLOCK_THREADS] = thread_units[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING PARTIAL BLOCK OF CHARACTERS, ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type /*ALIGNMENT*/) + { + AliasedLoadT thread_units[UINTS_PER_THREAD]; + + 
if (num_total_symbols <= block_offset) return; + + // Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) + OffsetT num_total_units = + CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT)); + + const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + cub::LoadDirectStriped( + threadIdx.x, d_block_symbols, thread_units, num_total_units); + +#pragma unroll + for (int32_t i = 0; i < UINTS_PER_THREAD; ++i) { + temp_storage.uints[threadIdx.x + i * BLOCK_THREADS] = thread_units[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING BLOCK OF CHARACTERS: DISPATCHER + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols) + { + // Check if pointer is aligned to four bytes + if (((uintptr_t)(const void*)(d_chars + block_offset) % 4) == 0) { + if (block_offset + SYMBOLS_PER_UINT_BLOCK < num_total_symbols) { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<4>()); + } else { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } + } else { + if (block_offset + SYMBOLS_PER_UINT_BLOCK < num_total_symbols) { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } else { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } + } + } + + template + __device__ __forceinline__ void GetThreadStateTransitionVector( + const SymbolMatcherT& symbol_matcher, + const TransitionTableT& transition_table, + const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + StateVectorT& state_vector) + { + using StateVectorTransitionOpT = StateVectorTransitionOp; + + // Start parsing and to transition states + StateVectorTransitionOpT 
transition_op(transition_table, state_vector); + + // Load characters into shared memory + LoadBlock(d_chars, block_offset, num_total_symbols); + + // If this is a full block (i.e., all threads can parse all their symbols) + OffsetT num_block_chars = num_total_symbols - block_offset; + bool is_full_block = (num_block_chars >= SYMBOLS_PER_BLOCK); + + // Ensure characters have been loaded + __syncthreads(); + + // Thread's symbols + CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + + // Parse thread's symbols and transition the state-vector + if (is_full_block) { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } else { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } + + // transition_op.TearDown(); + } + + template + __device__ __forceinline__ void GetThreadStateTransitions( + SymbolMatcherT const& symbol_matcher, + TransitionTableT const& transition_table, + CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + StateVectorT& state_vector, + CallbackOpT& callback_op, + cub::Int2Type /**/) + { + using StateTransitionOpT = StateTransitionOp; + + // Start parsing and to transition states + StateTransitionOpT transition_op(transition_table, state_vector, callback_op); + + // Load characters into shared memory + if (!BYPASS_LOAD) LoadBlock(d_chars, block_offset, num_total_symbols); + + // If this is a full block (i.e., all threads can parse all their symbols) + OffsetT num_block_chars = num_total_symbols - block_offset; + bool is_full_block = (num_block_chars >= SYMBOLS_PER_BLOCK); + + // Ensure characters have been loaded + __syncthreads(); + + // Thread's symbols + CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + + // Initialize callback + callback_op.Init(block_offset + threadIdx.x * SYMBOLS_PER_THREAD); + + // Parse thread's symbols and transition the state-vector + 
if (is_full_block) { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } else { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } + + callback_op.TearDown(); + } +}; + +template +__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ + void SimulateDFAKernel(DfaT dfa, + SymbolItT d_chars, + OffsetT const num_chars, + uint32_t seed_state, + StateVectorT* __restrict__ d_thread_state_transition, + TileStateT tile_state, + OutOffsetScanTileState offset_tile_state, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it) +{ + using StateIndexT = uint32_t; + + using AgentDfaSimT = AgentDFA; + + static constexpr uint32_t NUM_STATES = DfaT::MAX_NUM_STATES; + + enum { + BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentDFAPolicy::ITEMS_PER_THREAD, + SYMBOLS_PER_BLOCK = AgentDfaSimT::SYMBOLS_PER_BLOCK + }; + + // Shared memory required by the DFA simulator + __shared__ typename AgentDfaSimT::TempStorage dfa_storage; + + // Shared memory required by the symbol group lookup table + __shared__ typename DfaT::SymbolGroupStorageT symbol_matcher_storage; + + // Shared memory required by the transition table + __shared__ typename DfaT::TransitionTableStorageT transition_table_storage; + + // Shared memory required by the transducer table + __shared__ typename DfaT::TranslationTableStorageT transducer_table_storage; + + // Initialize symbol group lookup table + auto symbol_matcher = dfa.InitSymbolGroupLUT(symbol_matcher_storage); + + // Initialize transition table + auto transition_table = dfa.InitTransitionTable(transition_table_storage); + + // Initialize transition table + auto transducer_table = dfa.InitTranslationTable(transducer_table_storage); + + // Set up DFA + AgentDfaSimT agent_dfa(dfa_storage); + + // Memory is the state transition vector 
passed on to the second stage of the algorithm + StateVectorT out_state_vector; + + // Stage 1: Compute the state-transition vector + if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { + // StateVectorT state_vector; + MultiItemStateVector state_vector; + + // Initialize the seed state transition vector with the identity vector +#pragma unroll + for (int32_t i = 0; i < NUM_STATES; ++i) { + state_vector.Set(i, i); + } + + // Compute the state transition vector + agent_dfa.GetThreadStateTransitionVector(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, + num_chars, + state_vector); + + // Initialize the state transition vector passed on to the second stage +#pragma unroll + for (int32_t i = 0; i < NUM_STATES; ++i) { + out_state_vector.Set(i, state_vector.Get(i)); + } + + // Write out state-transition vector + if (!IS_SINGLE_PASS) { + d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x] = out_state_vector; + } + } + // Stage 2: Perform FSM simulation + if ((!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS) { + constexpr uint32_t SINGLE_ITEM_COUNT = 1; + MultiItemStateVector state; + + //------------------------------------------------------------------------------ + // SINGLE-PASS: + // -> block-wide inclusive prefix scan on the state transition vector + // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that + // incorporates all preceding blocks/tiles results) + //------------------------------------------------------------------------------ + if (IS_SINGLE_PASS) { + uint32_t tile_idx = blockIdx.x; + using StateVectorCompositeOpT = VectorCompositeOp; + + using PrefixCallbackOpT_ = + cub::TilePrefixCallbackOp; + + using ItemsBlockScan = + cub::BlockScan; + + __shared__ typename ItemsBlockScan::TempStorage scan_temp_storage; + __shared__ typename PrefixCallbackOpT_::TempStorage prefix_callback_temp_storage; + + // STATE-TRANSITION IDENTITY VECTOR + StateVectorT state_identity_vector; + for 
(int32_t i = 0; i < NUM_STATES; ++i) { + state_identity_vector.Set(i, i); + } + StateVectorCompositeOpT state_vector_scan_op; + + // + if (tile_idx == 0) { + StateVectorT block_aggregate; + ItemsBlockScan(scan_temp_storage) + .ExclusiveScan(out_state_vector, + out_state_vector, + state_identity_vector, + state_vector_scan_op, + block_aggregate); + + if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { + tile_state.SetInclusive(0, block_aggregate); + } + } else { + auto prefix_op = PrefixCallbackOpT_( + tile_state, prefix_callback_temp_storage, state_vector_scan_op, tile_idx); + + ItemsBlockScan(scan_temp_storage) + .ExclusiveScan(out_state_vector, out_state_vector, state_vector_scan_op, prefix_op); + } + __syncthreads(); + state.Set(0, out_state_vector.Get(seed_state)); + } else { + state.Set( + 0, d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x].Get(seed_state)); + } + + // Perform finite-state machine simulation, computing size of transduced output + DFASimulationCallbackWrapper + callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); + + MultiItemStateVector t_start_state; + t_start_state.Set(0, state.Get(seed_state)); + agent_dfa.GetThreadStateTransitions(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, + num_chars, + state, + callback_wrapper, + cub::Int2Type()); + + __syncthreads(); + using OffsetPrefixScanCallbackOpT_ = + cub::TilePrefixCallbackOp; + + using OutOffsetBlockScan = + cub::BlockScan; + + __shared__ typename OutOffsetBlockScan::TempStorage scan_temp_storage; + __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage; + + uint32_t tile_idx = blockIdx.x; + if (tile_idx == 0) { + OffsetT block_aggregate = 0; + OutOffsetBlockScan(scan_temp_storage) + .ExclusiveScan(callback_wrapper.out_count, + callback_wrapper.out_count, + static_cast(0), + cub::Sum{}, + block_aggregate); + + if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { + 
offset_tile_state.SetInclusive(0, block_aggregate); + } + + if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { + *d_num_transduced_out_it = block_aggregate; + } + } else { + auto prefix_op = OffsetPrefixScanCallbackOpT_( + offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx); + + OutOffsetBlockScan(scan_temp_storage) + .ExclusiveScan( + callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op); + + if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { + *d_num_transduced_out_it = prefix_op.GetInclusivePrefix(); + } + } + + callback_wrapper.write = true; + agent_dfa.GetThreadStateTransitions(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, + num_chars, + t_start_state, + callback_wrapper, + cub::Int2Type()); + } +} + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh new file mode 100644 index 00000000000..795c4c98bec --- /dev/null +++ b/cpp/src/io/fst/device_dfa.cuh @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cub/util_type.cuh" +#include "dispatch_dfa.cuh" +#include +#include +#include + +#include + +namespace cudf { +namespace io { +namespace fst { + +/** + * @brief Uses a deterministic finite automaton to transduce a sequence of symbols from an input + * iterator to a sequence of transduced output symbols. + * + * @tparam SymbolItT Random-access input iterator type to symbols fed into the FST + * @tparam DfaT The DFA specification + * @tparam TransducedOutItT Random-access output iterator to which the transduced output will be + * written + * @tparam TransducedIndexOutItT Random-access output iterator type to which the indexes of the + * symbols that caused some output to be written. + * @tparam TransducedCountOutItT A single-item output iterator type to which the total number of + * output symbols is written + * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) + * the output symbols + * @param[in] d_temp_storage Device-accessible allocation of temporary storage. When NULL, the + * required allocation size is written to \p temp_storage_bytes and no work is done. 
+ * @param[in,out] temp_storage_bytes Reference to size in bytes of \p d_temp_storage allocation + * @param[in] dfa The DFA specifying the number of distinct symbol groups, transition table, and + * translation table + * @param[in] d_chars_in Random-access input iterator to the beginning of the sequence of input + * symbols + * @param[in] num_chars The total number of input symbols to process + * @param[out] transduced_out_it Random-access output iterator to which the transduced output is + * written + * @param[out] transduced_out_idx_it Random-access output iterator to which, the index i is written + * iff the i-th input symbol caused some output to be written + * @param[out] d_num_transduced_out_it A single-item output iterator type to which the total number + * of output symbols is written + * @param[in] seed_state The DFA's starting state. For streaming DFAs this corresponds to the + * "end-state" of the previous invocation of the algorithm. + * @param[in] stream CUDA stream to launch kernels within. Default is the null-stream. + */ +template +cudaError_t DeviceTransduce(void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + uint32_t seed_state = 0, + cudaStream_t stream = 0) +{ + using DispatchDfaT = detail::DispatchFSM; + + return DispatchDfaT::Dispatch(d_temp_storage, + temp_storage_bytes, + dfa, + seed_state, + d_chars_in, + num_chars, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it, + stream); +} + +/** + * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the + * transition table and its number of states, the mapping of symbols to symbol groups, and the + * translation table that specifies which state transitions cause which output to be written). 
+ * + * @tparam OutSymbolT The symbol type being output by the finite-state transducer + * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of + * the transition table) + * @tparam TT_NUM_STATES The number of states defined by the DFA (the other dimension of the + * transition table) + */ +template +class Dfa { + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = TT_NUM_STATES; + + private: + // Symbol-group id lookup table + using MatcherT = detail::SingleSymbolSmemLUT; + using MatcherInitT = typename MatcherT::KernelParameter; + + // Transition table + using TransitionTableT = detail::TransitionTable; + using TransitionTableInitT = typename TransitionTableT::KernelParameter; + + // Translation lookup table + using OutSymbolOffsetT = uint32_t; + using TransducerTableT = detail::TransducerLookupTable; + using TransducerTableInitT = typename TransducerTableT::KernelParameter; + + // Private members (passed between host/device) + /// Information to initialize the device-side lookup table that maps symbol -> symbol group id + MatcherInitT symbol_matcher_init; + + /// Information to initialize the device-side transition table + TransitionTableInitT tt_init; + + /// Information to initialize the device-side translation table + TransducerTableInitT tt_out_init; + + public: + //--------------------------------------------------------------------- + // DEVICE-SIDE MEMBER FUNCTIONS + //--------------------------------------------------------------------- + using SymbolGroupStorageT = typename MatcherT::TempStorage; + using TransitionTableStorageT = typename TransitionTableT::TempStorage; + using TranslationTableStorageT = typename TransducerTableT::TempStorage; + + __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) + { + return MatcherT(symbol_matcher_init, temp_storage); + } + + 
__device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) + { + return TransitionTableT(tt_init, temp_storage); + } + + __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) + { + return TransducerTableT(tt_out_init, temp_storage); + } + + //--------------------------------------------------------------------- + // HOST-SIDE MEMBER FUNCTIONS + //--------------------------------------------------------------------- + template + cudaError_t Init(SymbolGroupIdItT const& symbol_vec, + std::vector> const& tt_vec, + std::vector>> const& out_tt_vec, + cudaStream_t stream = 0) + { + cudaError_t error = cudaSuccess; + + enum : uint32_t { MEM_SYMBOL_MATCHER = 0, MEM_TT, MEM_OUT_TT, NUM_ALLOCATIONS }; + + size_t allocation_sizes[NUM_ALLOCATIONS] = {0}; + void* allocations[NUM_ALLOCATIONS] = {0}; + + // Memory requirements: lookup table + error = MatcherT::PrepareLUT( + nullptr, allocation_sizes[MEM_SYMBOL_MATCHER], symbol_vec, symbol_matcher_init); + if (error) return error; + + // Memory requirements: transition table + error = + TransitionTableT::CreateTransitionTable(nullptr, allocation_sizes[MEM_TT], tt_vec, tt_init); + if (error) return error; + + // Memory requirements: transducer table + error = TransducerTableT::CreateTransitionTable( + nullptr, allocation_sizes[MEM_OUT_TT], out_tt_vec, tt_out_init); + if (error) return error; + + // Memory requirements: total memory + size_t temp_storage_bytes = 0; + error = cub::AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes); + if (error) return error; + + // Allocate memory + void* d_temp_storage = nullptr; + error = cudaMalloc(&d_temp_storage, temp_storage_bytes); + if (error) return error; + + // Alias memory + error = + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + if (error) return error; + + // Initialize symbol group lookup table + error = MatcherT::PrepareLUT(allocations[MEM_SYMBOL_MATCHER], + 
allocation_sizes[MEM_SYMBOL_MATCHER], + symbol_vec, + symbol_matcher_init, + stream); + if (error) return error; + + // Initialize state transition table + error = TransitionTableT::CreateTransitionTable( + allocations[MEM_TT], allocation_sizes[MEM_TT], tt_vec, tt_init, stream); + if (error) return error; + + // Initialize finite-state transducer lookup table + error = TransducerTableT::CreateTransitionTable( + allocations[MEM_OUT_TT], allocation_sizes[MEM_OUT_TT], out_tt_vec, tt_out_init, stream); + if (error) return error; + + return error; + } + + template + cudaError_t Transduce(void* d_temp_storage, + size_t& temp_storage_bytes, + SymbolT const* d_chars, + OffsetT num_chars, + TransducedOutItT d_out_it, + TransducedIndexOutItT d_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + const uint32_t seed_state = 0, + cudaStream_t stream = 0) + { + return DeviceTransduce(d_temp_storage, + temp_storage_bytes, + *this, + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); + } +}; + +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh new file mode 100644 index 00000000000..fc14faaf10a --- /dev/null +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "agent_dfa.cuh" +#include "in_reg_array.cuh" + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +/** + * @brief The tuning policy comprising all the architecture-specific compile-time tuning parameters. + * + * @tparam _BLOCK_THREADS Number of threads per block + * @tparam _ITEMS_PER_THREAD Number of symbols processed by each thread + */ +template +struct AgentDFAPolicy { + // The number of threads per block + static constexpr int32_t BLOCK_THREADS = _BLOCK_THREADS; + + // The number of symbols processed by each thread + static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD; +}; + +/** + * @brief The list of architecture-specific tuning policies. Yet TBD. + */ +struct DeviceFSMPolicy { + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> { + enum { + BLOCK_THREADS = 128, + ITEMS_PER_THREAD = 32, + }; + + using AgentDFAPolicy = AgentDFAPolicy; + }; + + // Top-of-list of the tuning policy "chain" + using MaxPolicy = Policy900; +}; + +/** + * @brief Kernel for initializing single-pass prefix scan tile states + * + * @param items_state The tile state + * @param num_tiles The number of tiles to be initialized + * @return + */ +template +__global__ void initialization_pass_kernel(TileState items_state, uint32_t num_tiles) +{ + items_state.InitializeStatus(num_tiles); +} + +template +struct DispatchFSM : DeviceFSMPolicy { + //------------------------------------------------------------------------------ + // DEFAULT TYPES + //------------------------------------------------------------------------------ + using StateIndexT = uint32_t; + using BlockOffsetT = uint32_t; + + //------------------------------------------------------------------------------ + // DERIVED 
CONFIGS + //------------------------------------------------------------------------------ + // DFA-specific configs + static constexpr int32_t MAX_NUM_STATES = DfaT::MAX_NUM_STATES; + static constexpr int32_t MAX_NUM_SYMBOLS = DfaT::MAX_NUM_SYMBOLS; + + // Whether to use a single-pass prefix scan that does all in on + static constexpr bool SINGLE_PASS_STV = false; + + // Whether this is a finite-state transform + static constexpr bool IS_FST = true; + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + using StateVectorCompositeOpT = VectorCompositeOp; + + //------------------------------------------------------------------------------ + // MEMBER VARS + //------------------------------------------------------------------------------ + void* d_temp_storage; + size_t& temp_storage_bytes; + DfaT dfa; + StateIndexT seed_state; + SymbolItT d_chars_in; + OffsetT num_chars; + TransducedOutItT transduced_out_it; + TransducedIndexOutItT transduced_out_idx_it; + TransducedCountOutItT d_num_transduced_out_it; + cudaStream_t stream; + int ptx_version; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + CUB_RUNTIME_FUNCTION __forceinline__ DispatchFSM(void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + StateIndexT seed_state, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + cudaStream_t stream, + int ptx_version) + : d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + dfa(dfa), + seed_state(seed_state), + d_chars_in(d_chars_in), + num_chars(num_chars), + transduced_out_it(transduced_out_it), + transduced_out_idx_it(transduced_out_idx_it), + 
d_num_transduced_out_it(d_num_transduced_out_it), + stream(stream), + ptx_version(ptx_version) + { + } + + //------------------------------------------------------------------------------ + // DISPATCH INTERFACE + //------------------------------------------------------------------------------ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + StateIndexT seed_state, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + cudaStream_t stream) + { + using MaxPolicyT = DispatchFSM::MaxPolicy; + + cudaError_t error; + + // Get PTX version + int ptx_version; + error = cub::PtxVersion(ptx_version); + if (error) return error; + + // Create dispatch functor + DispatchFSM dispatch(d_temp_storage, + temp_storage_bytes, + dfa, + seed_state, + d_chars_in, + num_chars, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it, + stream, + ptx_version); + + error = MaxPolicyT::Invoke(ptx_version, dispatch); + return error; + } + + //------------------------------------------------------------------------------ + // DFA SIMULATION KERNEL INVOCATION + //------------------------------------------------------------------------------ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokeDFASimulationKernel(DFASimulationKernelT dfa_kernel, + int32_t sm_count, + StateIndexT seed_state, + StateVectorT* d_thread_state_transition, + TileStateT tile_state, + FstScanTileStateT fst_tile_state) + + { + cudaError_t error = cudaSuccess; + cub::KernelConfig dfa_simulation_config; + + using PolicyT = typename ActivePolicyT::AgentDFAPolicy; + if (CubDebug(error = dfa_simulation_config.Init(dfa_kernel))) return error; + + // Kernel invocation + uint32_t grid_size = + CUB_QUOTIENT_CEILING(num_chars, PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD); + uint32_t 
block_threads = dfa_simulation_config.block_threads; + + dfa_kernel<<>>(dfa, + d_chars_in, + num_chars, + seed_state, + d_thread_state_transition, + tile_state, + fst_tile_state, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it); + + // Check for errors + if (CubDebug(error = cudaPeekAtLastError())) return error; + + return error; + } + + /** + * @brief Computes the state-transition vectors + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + ComputeStateTransitionVector(uint32_t sm_count, + TileStateT tile_state, + FstScanTileStateT fst_tile_state, + StateVectorT* d_thread_state_transition) + { + StateIndexT seed_state = 0; + + return InvokeDFASimulationKernel( + SimulateDFAKernel, + sm_count, + seed_state, + d_thread_state_transition, + tile_state, + fst_tile_state); + } + + /** + * @brief Performs the actual DFA simulation. + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + SimulateDFA(uint32_t sm_count, + TileStateT tile_state, + FstScanTileStateT fst_tile_state, + StateIndexT seed_state, + StateVectorT* d_thread_state_transition) + { + return InvokeDFASimulationKernel( + SimulateDFAKernel, + sm_count, + seed_state, + d_thread_state_transition, + tile_state, + fst_tile_state); + } + + //------------------------------------------------------------------------------ + // POLICY INVOKATION + //------------------------------------------------------------------------------ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() + { + cudaError_t error = cudaSuccess; + + // Get SM count + int device_ordinal; + int sm_count; + + // Get current device + error = cudaGetDevice(&device_ordinal); + if (error) + + error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal); + if (error) return error; + + //------------------------------------------------------------------------------ + // DERIVED TYPEDEFS + 
//------------------------------------------------------------------------------ + // Type used to represent state-transition vectors + using StateVectorT = MultiFragmentInRegArray; + + // Scan tile state used for propagating composed state transition vectors + using ScanTileStateT = typename cub::ScanTileState; + + // Scan tile state used for propagating transduced output offsets + using FstScanTileStateT = typename cub::ScanTileState; + + // STATE-TRANSITION IDENTITY VECTOR + StateVectorT state_identity_vector; + for (int32_t i = 0; i < MAX_NUM_STATES; ++i) { + state_identity_vector.Set(i, i); + } + StateVectorCompositeOpT state_vector_scan_op; + + //------------------------------------------------------------------------------ + // DERIVED CONFIGS + //------------------------------------------------------------------------------ + enum { + BLOCK_THREADS = ActivePolicyT::BLOCK_THREADS, + SYMBOLS_PER_THREAD = ActivePolicyT::ITEMS_PER_THREAD, + NUM_SYMBOLS_PER_BLOCK = BLOCK_THREADS * SYMBOLS_PER_THREAD + }; + + BlockOffsetT num_blocks = CUB_QUOTIENT_CEILING(num_chars, NUM_SYMBOLS_PER_BLOCK); + size_t num_threads = num_blocks * BLOCK_THREADS; + + //------------------------------------------------------------------------------ + // TEMPORARY MEMORY REQUIREMENTS + //------------------------------------------------------------------------------ + enum { MEM_STATE_VECTORS = 0, MEM_SCAN, MEM_SINGLE_PASS_STV, MEM_FST_OFFSET, NUM_ALLOCATIONS }; + + size_t allocation_sizes[NUM_ALLOCATIONS] = {0}; + void* allocations[NUM_ALLOCATIONS] = {0}; + + size_t vector_scan_storage_bytes = 0; + + // [MEMORY REQUIREMENTS] STATE-TRANSITION SCAN + cub::DeviceScan::ExclusiveScan(nullptr, + vector_scan_storage_bytes, + static_cast(allocations[MEM_STATE_VECTORS]), + static_cast(allocations[MEM_STATE_VECTORS]), + state_vector_scan_op, + state_identity_vector, + num_threads, + stream); + + allocation_sizes[MEM_STATE_VECTORS] = num_threads * sizeof(StateVectorT); + allocation_sizes[MEM_SCAN] = 
vector_scan_storage_bytes; + + // Bytes needed for tile status descriptors (fusing state-transition vector + DFA simulation) + if (SINGLE_PASS_STV) { + error = ScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_SINGLE_PASS_STV]); + if (error) return error; + } + + // Bytes needed for tile status descriptors (DFA simulation pass for output size computation + + // output-generating pass) + if (IS_FST) { + error = FstScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_FST_OFFSET]); + if (error) return error; + } + + // Alias the temporary allocations from the single storage blob (or compute the necessary size + // of the blob) + error = + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + if (error) return error; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) return cudaSuccess; + + // Alias memory for state-transition vectors + StateVectorT* d_thread_state_transition = + static_cast(allocations[MEM_STATE_VECTORS]); + + //------------------------------------------------------------------------------ + // INITIALIZE SCAN TILE STATES COMPUTING TRANSDUCED OUTPUT OFFSETS + //------------------------------------------------------------------------------ + FstScanTileStateT fst_offset_tile_state; + if (IS_FST) { + // Construct the tile status (aliases memory internally et al.) 
+ error = fst_offset_tile_state.Init( + num_blocks, allocations[MEM_FST_OFFSET], allocation_sizes[MEM_FST_OFFSET]); + if (error) return error; + constexpr uint32_t FST_INIT_TPB = 256; + uint32_t num_fst_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, FST_INIT_TPB); + initialization_pass_kernel<<>>( + fst_offset_tile_state, num_blocks); + } + + //------------------------------------------------------------------------------ + // COMPUTE STATE-TRANSITION VECTORS + //------------------------------------------------------------------------------ + ScanTileStateT stv_tile_state; + if constexpr(SINGLE_PASS_STV) { + // Construct the tile status (aliases memory internally et al.) + error = stv_tile_state.Init( + num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]); + if (error) return error; + constexpr uint32_t STV_INIT_TPB = 256; + uint32_t num_stv_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, STV_INIT_TPB); + initialization_pass_kernel<<>>(stv_tile_state, + num_blocks); + } else { + // Compute state-transition vectors + // TODO tag dispatch or constexpr if depending on single-pass config to avoid superfluous + // template instantiations + ComputeStateTransitionVector( + sm_count, stv_tile_state, fst_offset_tile_state, d_thread_state_transition); + + // State-transition vector scan computing using the composition operator + cub::DeviceScan::ExclusiveScan(allocations[MEM_SCAN], + allocation_sizes[MEM_SCAN], + d_thread_state_transition, + d_thread_state_transition, + state_vector_scan_op, + state_identity_vector, + num_threads, + stream); + } + + //------------------------------------------------------------------------------ + // SIMULATE DFA + //------------------------------------------------------------------------------ + return SimulateDFA( + sm_count, stv_tile_state, fst_offset_tile_state, seed_state, d_thread_state_transition); + } +}; +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git 
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

// NOTE(review): include targets were stripped by the patch-mangling; reconstructed
#include <cub/cub.cuh>

#include <cstdint>

namespace cudf {
namespace io {
namespace fst {
namespace detail {

/**
 * @brief A bit-packed array of items that can be backed by registers yet allows to be dynamically
 * addressed at runtime. The data structure is explained in greater detail in the paper
 * "ParPaRaw: Massively Parallel Parsing of Delimiter-Separated Raw Data".
 *
 * @tparam NUM_ITEMS The maximum number of items this data structure is supposed to store
 * @tparam MAX_ITEM_VALUE The maximum value that one item can represent
 * @tparam BackingFragmentT The data type that is holding the fragments
 *
 * NOTE(review): the template-parameter list was stripped by the patch-mangling; reconstructed
 * from the @tparam documentation — confirm parameter order and the fragment type's default
 */
template <uint32_t NUM_ITEMS, uint32_t MAX_ITEM_VALUE, typename BackingFragmentT = uint32_t>
struct MultiFragmentInRegArray {
  /// [b] Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE]
  static constexpr uint32_t MIN_BITS_PER_ITEM =
    (MAX_ITEM_VALUE == 0) ? 1 : cub::Log2<(MAX_ITEM_VALUE + 1)>::VALUE;

  /// Number of bits that each fragment can store
  static constexpr uint32_t NUM_BITS_PER_FRAGMENT = sizeof(BackingFragmentT) * 8;

  /// [a] The number of bits per fragment per item in the array
  static constexpr uint32_t AVAIL_BITS_PER_FRAG_ITEM = NUM_BITS_PER_FRAGMENT / NUM_ITEMS;

  /// [k] The number of bits per item per fragment, rounded down to a power of two to avoid costly
  /// integer multiplication
  /// TODO: specialise for VOLTA and later architectures that have efficient integer multiplication
  static constexpr uint32_t BITS_PER_FRAG_ITEM =
    0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1);
  static constexpr uint32_t LOG2_BITS_PER_FRAG_ITEM = cub::Log2<BITS_PER_FRAG_ITEM>::VALUE;

  // [f] Number of fragments required to store and to reconstruct an item
  static constexpr uint32_t FRAGMENTS_PER_ITEM =
    (MIN_BITS_PER_ITEM + BITS_PER_FRAG_ITEM - 1) / BITS_PER_FRAG_ITEM;

  //------------------------------------------------------------------------------
  // HELPER FUNCTIONS
  //------------------------------------------------------------------------------
  /// Extracts the num_bits-wide bit-field of \p data starting at \p bit_start
  __device__ __host__ __forceinline__ unsigned int bfe(const unsigned int& data,
                                                       unsigned int bit_start,
                                                       unsigned int num_bits) const
  {
#if CUB_PTX_ARCH > 0
    return cub::BFE(data, bit_start, num_bits);
#else
    // NOTE(review): the shift is UB for num_bits == 32; callers only pass BITS_PER_FRAG_ITEM < 32
    const unsigned int MASK = (1 << num_bits) - 1;
    return (data >> bit_start) & MASK;
#endif
  }

  /// Inserts the num_bits-wide bit-field \p bits into \p data at \p bit_start
  __device__ __host__ __forceinline__ void bfi(unsigned int& data,
                                               unsigned int bits,
                                               unsigned int bit_start,
                                               unsigned int num_bits) const
  {
#if CUB_PTX_ARCH > 0
    cub::BFI(data, data, bits, bit_start, num_bits);
#else
    unsigned int x      = bits << bit_start;
    unsigned int y      = data;
    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
    unsigned int MASK_Y = ~MASK_X;
    data                = (y & MASK_Y) | (x & MASK_X);
#endif
  }

  //------------------------------------------------------------------------------
  // MEMBER VARIABLES
  //------------------------------------------------------------------------------
  /// The fragments across which each item's bits are scattered
  BackingFragmentT data[FRAGMENTS_PER_ITEM];

  //------------------------------------------------------------------------------
  // ACCESSORS
  //------------------------------------------------------------------------------
  /// Reassembles the index-th item from its per-fragment bit-fields
  __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const
  {
    uint32_t val = 0;

    // #pragma unroll
    for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) {
      val = val | bfe(data[i], index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM)
                    << (i * BITS_PER_FRAG_ITEM);
    }
    return val;
  }

  /// Scatters the item's bits across the fragments
  __host__ __device__ __forceinline__ void Set(uint32_t index, uint32_t value)
  {
    // #pragma unroll
    for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) {
      uint32_t frag_bits = bfe(value, i * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM);
      bfi(data[i], frag_bits, index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM);
    }
  }

  //------------------------------------------------------------------------------
  // CONSTRUCTORS
  //------------------------------------------------------------------------------
  /// Zero-initializes all fragments (i.e., every item reads back as 0)
  __host__ __device__ __forceinline__ MultiFragmentInRegArray()
  {
    for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) {
      data[i] = 0;
    }
  }

  /// Initializes the packed array from a plain array of item values
  __host__ __device__ __forceinline__ MultiFragmentInRegArray(uint32_t const (&array)[NUM_ITEMS])
  {
    for (uint32_t i = 0; i < NUM_ITEMS; ++i) {
      Set(i, array[i]);
    }
  }
};

}  // namespace detail
}  // namespace fst
}  // namespace io
}  // namespace cudf

// ==== next file in patch: cpp/src/io/fst/symbol_lut.cuh (new file) ====
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { +/** + * @brief Class template that can be plugged into the finite-state machine to look up the symbol + * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no + * look-ahead). + * + * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id + */ +template +struct SingleSymbolSmemLUT { + //------------------------------------------------------------------------------ + // DEFAULT TYPEDEFS + //------------------------------------------------------------------------------ + // Type used for representing a symbol group id (i.e., what we return for a given symbol) + using SymbolGroupIdT = uint8_t; + + //------------------------------------------------------------------------------ + // DERIVED CONFIGURATIONS + //------------------------------------------------------------------------------ + /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) + static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + + struct _TempStorage { + // d_match_meta_data[symbol] -> symbol group index + SymbolGroupIdT match_meta_data[NUM_ENTRIES_PER_LUT]; + }; + + struct KernelParameter { + // 
d_match_meta_data[min(symbol,num_valid_entries)] -> symbol group index + SymbolGroupIdT num_valid_entries; + + // d_match_meta_data[symbol] -> symbol group index + SymbolGroupIdT* d_match_meta_data; + }; + + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + + //------------------------------------------------------------------------------ + // HELPER METHODS + //------------------------------------------------------------------------------ + /** + * @brief + * + * @param[in] d_temp_storage Device-side temporary storage that can be used to store the lookup + * table. If no storage is provided it will return the temporary storage requirements in \p + * d_temp_storage_bytes. + * @param[in,out] d_temp_storage_bytes Amount of device-side temporary storage that can be used in + * the number of bytes + * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols + * (characters!) that correspond to the i-th symbol group index + * @param[out] kernel_param The kernel parameter object to be initialized with the given mapping + * of symbols to symbol group ids. 
+ * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table + * @return + */ + template + __host__ __forceinline__ static cudaError_t PrepareLUT(void* d_temp_storage, + size_t& d_temp_storage_bytes, + SymbolGroupItT const& symbol_strings, + KernelParameter& kernel_param, + cudaStream_t stream = 0) + { + // The symbol group index to be returned if none of the given symbols match + SymbolGroupIdT no_match_id = symbol_strings.size(); + + std::vector lut(NUM_ENTRIES_PER_LUT); + SymbolGroupIdT max_base_match_val = 0; + + // Initialize all entries: by default we return the no-match-id + for (uint32_t i = 0; i < NUM_ENTRIES_PER_LUT; ++i) { + lut[i] = no_match_id; + } + + // Set up lookup table + uint32_t sg_id = 0; + for (auto const& sg_symbols : symbol_strings) { + for (auto const& sg_symbol : sg_symbols) { + max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); + lut[sg_symbol] = sg_id; + } + sg_id++; + } + + // Initialize the out-of-bounds lookup: d_match_meta_data[max_base_match_val+1] -> no_match_id + lut[max_base_match_val + 1] = no_match_id; + + // Alias memory / return memory requiremenets + kernel_param.num_valid_entries = max_base_match_val + 2; + if (d_temp_storage) { + cudaError_t error = cudaMemcpyAsync(d_temp_storage, + lut.data(), + kernel_param.num_valid_entries * sizeof(SymbolGroupIdT), + cudaMemcpyHostToDevice, + stream); + + kernel_param.d_match_meta_data = reinterpret_cast(d_temp_storage); + return error; + } else { + d_temp_storage_bytes = kernel_param.num_valid_entries * sizeof(SymbolGroupIdT); + return cudaSuccess; + } + + return cudaSuccess; + } + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + SymbolGroupIdT num_valid_entries; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + 
//------------------------------------------------------------------------------ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + __host__ __device__ __forceinline__ SingleSymbolSmemLUT(KernelParameter const& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) + { + // GPU-side init +#if CUB_PTX_ARCH > 0 + for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) { + this->temp_storage.match_meta_data[i] = kernel_param.d_match_meta_data[i]; + } + __syncthreads(); + +#else + // CPU-side init + for (std::size_t i = 0; i < kernel_param.num_luts; i++) { + this->temp_storage.match_meta_data[i] = kernel_param.d_match_meta_data[i]; + } +#endif + } + + __host__ __device__ __forceinline__ int32_t operator()(SymbolT const symbol) const + { + // Look up the symbol group for given symbol + return temp_storage.match_meta_data[min(symbol, num_valid_entries - 1)]; + } +}; + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/transition_table.cuh b/cpp/src/io/fst/transition_table.cuh new file mode 100644 index 00000000000..97fef03d8af --- /dev/null +++ b/cpp/src/io/fst/transition_table.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +template +struct TransitionTable { + //------------------------------------------------------------------------------ + // DEFAULT TYPEDEFS + //------------------------------------------------------------------------------ + using ItemT = char; + + struct TransitionVectorWrapper { + const ItemT* data; + + __host__ __device__ TransitionVectorWrapper(const ItemT* data) : data(data) {} + + __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const { return data[index]; } + }; + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + using TransitionVectorT = TransitionVectorWrapper; + + struct _TempStorage { + // + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + }; + + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + + struct KernelParameter { + ItemT* transitions; + }; + + using LoadAliasT = std::uint32_t; + + static constexpr std::size_t NUM_AUX_MEM_BYTES = + CUB_QUOTIENT_CEILING(MAX_NUM_STATES * MAX_NUM_SYMBOLS * sizeof(ItemT), sizeof(LoadAliasT)) * + sizeof(LoadAliasT); + + //------------------------------------------------------------------------------ + // HELPER METHODS + //------------------------------------------------------------------------------ + __host__ static cudaError_t CreateTransitionTable( + void* d_temp_storage, + size_t& temp_storage_bytes, + const std::vector>& trans_table, + KernelParameter& kernel_param, + cudaStream_t stream = 0) + { + if (!d_temp_storage) { + temp_storage_bytes = NUM_AUX_MEM_BYTES; + return cudaSuccess; + } + + // trans_vectors[symbol][state] -> new_state + ItemT trans_vectors[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + + // trans_table[state][symbol] -> new state + for (std::size_t state = 0; state < trans_table.size(); ++state) { + for (std::size_t symbol 
= 0; symbol < trans_table[state].size(); ++symbol) { + trans_vectors[symbol * MAX_NUM_STATES + state] = trans_table[state][symbol]; + } + } + + kernel_param.transitions = static_cast(d_temp_storage); + + // Copy transition table to device + return cudaMemcpyAsync( + d_temp_storage, trans_vectors, NUM_AUX_MEM_BYTES, cudaMemcpyHostToDevice, stream); + } + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __host__ __device__ __forceinline__ TransitionTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < CUB_QUOTIENT_CEILING(NUM_AUX_MEM_BYTES, sizeof(LoadAliasT)); + i += blockDim.x) { + reinterpret_cast(this->temp_storage.transitions)[i] = + reinterpret_cast(kernel_param.transitions)[i]; + } + __syncthreads(); +#else + for (int i = 0; i < kernel_param.num_luts; i++) { + this->temp_storage.transitions[i] = kernel_param.transitions[i]; + } +#endif + } + + /** + * @brief Returns a random-access iterator to lookup all the state transitions for one specific + * symbol from an arbitrary old_state, i.e., it[old_state] -> new_state. 
+ * + * @param state_id The DFA's current state index from which we'll transition + * @param match_id The symbol group id of the symbol that we just read in + * @return + */ + template + __host__ __device__ __forceinline__ int32_t operator()(StateIndexT state_id, + SymbolIndexT match_id) const + { + return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id]; + } +}; + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/translation_table.cuh b/cpp/src/io/fst/translation_table.cuh new file mode 100644 index 00000000000..bfbfd41e3f0 --- /dev/null +++ b/cpp/src/io/fst/translation_table.cuh @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "in_reg_array.cuh" + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +/** + * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols to + * output + * + * @tparam OutSymbolT The symbol type being returned + * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + */ +template +struct TransducerLookupTable { + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + struct _TempStorage { + OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT out_symbols[MAX_TABLE_SIZE]; + }; + + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + + struct KernelParameter { + OutSymbolOffsetT* d_trans_offsets; + OutSymbolT* d_out_symbols; + }; + + //------------------------------------------------------------------------------ + // HELPER METHODS + //------------------------------------------------------------------------------ + __host__ static cudaError_t CreateTransitionTable( + void* d_temp_storage, + size_t& temp_storage_bytes, + const std::vector>>& trans_table, + KernelParameter& kernel_param, + cudaStream_t stream = 0) + { + enum { MEM_OFFSETS = 0, MEM_OUT_SYMBOLS, NUM_ALLOCATIONS }; + + size_t allocation_sizes[NUM_ALLOCATIONS] = {}; + void* allocations[NUM_ALLOCATIONS] = {}; + allocation_sizes[MEM_OFFSETS] = + (MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1) * sizeof(OutSymbolOffsetT); + allocation_sizes[MEM_OUT_SYMBOLS] = MAX_TABLE_SIZE * sizeof(OutSymbolT); + + // Alias the 
temporary allocations from the single storage blob (or compute the necessary size + // of the blob) + cudaError_t error = + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + if (error) return error; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == nullptr) return cudaSuccess; + + std::vector out_symbols; + out_symbols.reserve(MAX_TABLE_SIZE); + std::vector out_symbol_offsets; + out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); + out_symbol_offsets.push_back(0); + + int st = 0; + // Iterate over the states in the transition table + for (auto const& state_trans : trans_table) { + uint32_t num_added = 0; + // Iterate over the symbols in the transition table + for (auto const& symbol_out : state_trans) { + // Insert the output symbols for this specific (state, symbol) transition + out_symbols.insert(std::end(out_symbols), std::begin(symbol_out), std::end(symbol_out)); + out_symbol_offsets.push_back(out_symbols.size()); + num_added++; + } + st++; + + // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of + // this state) + if (MAX_NUM_SYMBOLS > num_added) { + int32_t count = MAX_NUM_SYMBOLS - num_added; + auto begin_it = std::prev(std::end(out_symbol_offsets)); + std::copy(begin_it, begin_it + count, std::back_inserter(out_symbol_offsets)); + } + } + + // Check whether runtime-provided table size exceeds the compile-time given max. 
table size + if (out_symbols.size() > MAX_TABLE_SIZE) { return cudaErrorInvalidValue; } + + kernel_param.d_trans_offsets = static_cast(allocations[MEM_OFFSETS]); + kernel_param.d_out_symbols = static_cast(allocations[MEM_OUT_SYMBOLS]); + + // Copy out symbols + error = cudaMemcpyAsync(kernel_param.d_trans_offsets, + out_symbol_offsets.data(), + out_symbol_offsets.size() * sizeof(out_symbol_offsets[0]), + cudaMemcpyHostToDevice, + stream); + if (error) { return error; } + + // Copy offsets into output symbols + return cudaMemcpyAsync(kernel_param.d_out_symbols, + out_symbols.data(), + out_symbols.size() * sizeof(out_symbols[0]), + cudaMemcpyHostToDevice, + stream); + } + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __host__ __device__ __forceinline__ TransducerLookupTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { + constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { + this->temp_storage.out_offset[i] = kernel_param.d_trans_offsets[i]; + } + // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared + // memory + __syncthreads(); + for (int i = threadIdx.x; i < this->temp_storage.out_offset[num_offsets - 1]; i += blockDim.x) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } + __syncthreads(); +#else + for (int i = 0; i < num_offsets; i++) { + 
this->temp_storage.out_symbol_offsets[i] = kernel_param.d_trans_offsets[i]; + } + for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } +#endif + } + + template + __host__ __device__ __forceinline__ OutSymbolT operator()(StateIndexT state_id, + SymbolIndexT match_id, + RelativeOffsetT relative_offset) const + { + auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; + return temp_storage.out_symbols[offset]; + } + + template + __host__ __device__ __forceinline__ OutSymbolOffsetT operator()(StateIndexT state_id, + SymbolIndexT match_id) const + { + return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - + temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; + } +}; + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a1e3cfed286..587f317692f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -225,6 +225,7 @@ ConfigureTest(JSON_TEST io/json_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) +ConfigureTest(FST_TEST io/fst/fst_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu new file mode 100644 index 00000000000..26bb9d47dca --- /dev/null +++ b/cpp/tests/io/fst/fst_test.cu @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include +#include +#include + +#include "cub/cub.cuh" + +#include +#include + +namespace { + +//------------------------------------------------------------------------------ +// CPU-BASED IMPLEMENTATIONS FOR VERIFICATION +//------------------------------------------------------------------------------ +/** + * @brief CPU-based implementation of a finite-state transducer (FST). + * + * @tparam InputItT Forward input iterator type to symbols fed into the FST + * @tparam StateT Type representing states of the finite-state machine + * @tparam SymbolGroupLutT Sequence container of symbol groups. Each symbol group is a sequence + * container to symbols within that group. + * @tparam TransitionTableT Two-dimensional container type + * @tparam TransducerTableT Two-dimensional container type + * @tparam OutputItT Forward output iterator type + * @tparam IndexOutputItT Forward output iterator type + * @param[in] begin Forward iterator to the beginning of the symbol sequence + * @param[in] end Forward iterator to one past the last element of the symbol sequence + * @param[in] init_state The starting state of the finite-state machine + * @param[in] symbol_group_lut Sequence container of symbol groups. Each symbol group is a sequence + * container to symbols within that group. The index of the symbol group containing a symbol being + * read will be used as symbol_gid of the transition and translation tables. 
+ * @param[in] transition_table The two-dimensional transition table, i.e., + * transition_table[state][symbol_gid] -> new_state + * @param[in] translation_table The two-dimensional transducer table, i.e., + * translation_table[state][symbol_gid] -> range_of_output_symbols + * @param[out] out_tape A forward output iterator to which the transduced input will be written + * @param[out] out_index_tape A forward output iterator to which indexes of the symbols that + * actually caused some output are written to + * @return A pair of iterators to one past the last element of (1) the transduced output symbol + * sequence and (2) the indexes of + */ +template +static std::pair fst_baseline(InputItT begin, + InputItT end, + StateT const& init_state, + SymbolGroupLutT symbol_group_lut, + TransitionTableT transition_table, + TransducerTableT translation_table, + OutputItT out_tape, + IndexOutputItT out_index_tape) +{ + // Initialize "FSM" with starting state + StateT state = init_state; + + // To track the symbol offset within the input that caused the FST to output + std::size_t in_offset = 0; + for (auto it = begin; it < end; it++) { + // The symbol currently being read + auto const& symbol = *it; + + std::size_t symbol_group = 0; + bool found = false; + + // Iterate over symbol groups and search for the first symbol group containing the current + // symbol + for (auto const& sg : symbol_group_lut) { + for (auto const& s : sg) + if (s == symbol) found = true; + if (found) break; + symbol_group++; + } + + // Output the translated symbols to the output tape + size_t inserted = 0; + for (auto out : translation_table[state][symbol_group]) { + // std::cout << in_offset << ": " << out << "\n"; + *out_tape = out; + ++out_tape; + inserted++; + } + + // Output the index of the current symbol, iff it caused some output to be written + if (inserted > 0) { + *out_index_tape = in_offset; + out_index_tape++; + } + + // Transition the state of the finite-state machine + state = 
transition_table[state][symbol_group]; + + in_offset++; + } + return {out_tape, out_index_tape}; +} + +//------------------------------------------------------------------------------ +// TEST FST SPECIFICATIONS +//------------------------------------------------------------------------------ +// FST to check for brackets and braces outside of pairs of quotes +// The state being active while being outside of a string. When encountering an opening bracket +// or curly brace, we push it onto the stack. When encountering a closing bracket or brace, we +// pop it from the stack. +constexpr uint32_t TT_OOS = 0U; + +// The state being active while being within a string (e.g., field name or a string value). We do +// not push or pop from the stack while being in this state. +constexpr uint32_t TT_STR = 1U; + +// The state being active after encountering an escape symbol (e.g., '\') while being in the TT_STR +// state. constexpr uint32_t TT_ESC = 2U; // cmt to avoid 'unused' warning + +// Total number of states +constexpr uint32_t TT_NUM_STATES = 3U; + +// Definition of the symbol groups +enum PDA_SG_ID { + OBC = 0U, ///< Opening brace SG: { + OBT, ///< Opening bracket SG: [ + CBC, ///< Closing brace SG: } + CBT, ///< Closing bracket SG: ] + QTE, ///< Quote character SG: " + ESC, ///< Escape character SG: '\' + OTR, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +// Transition table +const std::vector> pda_state_tt = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, + /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR, TT_STR}, + /* TT_ESC */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}; + +// Translation table (i.e., for each transition, what are the symbols that we output) +const std::vector>> pda_out_tt = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}, + /* TT_STR */ 
{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}, + /* TT_ESC */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}; + +// The i-th string representing all the characters of a symbol group +const std::vector pda_sgs = {"{", "[", "}", "]", "\"", "\\"}; + +// The DFA's starting state +constexpr int32_t start_state = TT_OOS; + +} // namespace + +// Base test fixture for tests +struct FstTest : public cudf::test::BaseFixture { +}; + +TEST_F(FstTest, GroundTruth) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + + // Type sufficiently large to index symbols within the input and output (may be unsigned) + using SymbolOffsetT = uint32_t; + + // Helper class to set up transition table, symbol group lookup table, and translation table + using DfaFstT = cudf::io::fst::Dfa; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( {)" + R"(category": "reference",)" + R"("index:" [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} )" + R"({)" + R"("category": "reference",)" + R"("index:" [4,{},null,{"a":[]}],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} {} [] [ ])"; + + // Repeat input sample 1024x + for (std::size_t i = 0; i < 10; i++) + input += input; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + hostdevice_vector output_gpu(input.size(), stream_view); + hostdevice_vector out_indexes_gpu(input.size(), stream_view); + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + + // Run algorithm + DfaFstT parser; + + // Initialize DFA + ASSERT_CUDA_SUCCEEDED(parser.Init(pda_sgs, pda_state_tt, pda_out_tt, stream)); + 
+ std::size_t temp_storage_bytes = 0; + + // Query temporary storage requirements + ASSERT_CUDA_SUCCEEDED(parser.Transduce(nullptr, + temp_storage_bytes, + d_input.data(), + static_cast(d_input.size()), + output_gpu.device_ptr(), + out_indexes_gpu.device_ptr(), + cub::DiscardOutputIterator{}, + start_state, + stream)); + + // Allocate device-side temporary storage & run algorithm + rmm::device_buffer temp_storage{temp_storage_bytes, stream_view}; + ASSERT_CUDA_SUCCEEDED(parser.Transduce(temp_storage.data(), + temp_storage_bytes, + d_input.data(), + static_cast(d_input.size()), + output_gpu.device_ptr(), + out_indexes_gpu.device_ptr(), + cub::DiscardOutputIterator{}, + start_state, + stream)); + + // Async copy results from device to host + output_gpu.device_to_host(stream_view); + out_indexes_gpu.device_to_host(stream_view); + + // Prepare CPU-side results for verification + std::string output_cpu{}; + std::vector out_index_cpu{}; + output_cpu.reserve(input.size()); + out_index_cpu.reserve(input.size()); + + // Run CPU-side algorithm + fst_baseline(std::begin(input), + std::end(input), + start_state, + pda_sgs, + pda_state_tt, + pda_out_tt, + std::back_inserter(output_cpu), + std::back_inserter(out_index_cpu)); + + // Make sure results have been copied back to host + cudaStreamSynchronize(stream); + + // Verify results + ASSERT_EQ(output_gpu.size(), output_cpu.size()); + ASSERT_EQ(out_indexes_gpu.size(), out_index_cpu.size()); + for (std::size_t i = 0; i < output_gpu.size(); i++) { + ASSERT_EQ(output_gpu.host_ptr()[i], output_cpu[i]) << "Mismatch at index #" << i; + } + for (std::size_t i = 0; i < out_indexes_gpu.size(); i++) { + ASSERT_EQ(out_indexes_gpu.host_ptr()[i], out_index_cpu[i]) << "Mismatch at index #" << i; + } +} + +CUDF_TEST_PROGRAM_MAIN() From 355d1e43e29e4eeadc21f9d4d9e6aa43ee8afe9b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Apr 2022 05:11:32 -0700 Subject: [PATCH 02/38] clean up & addressing 
review comments --- cpp/src/io/fst/agent_dfa.cuh | 28 ++++++++++++---------- cpp/src/io/fst/dispatch_dfa.cuh | 4 ++-- cpp/src/io/fst/in_reg_array.cuh | 42 ++++++++++++++++----------------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index d983f9287a9..0611973f78c 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -17,8 +17,6 @@ #include "in_reg_array.cuh" -#include - #include namespace cudf { @@ -40,13 +38,13 @@ template class MultiItemStateVector { public: template - constexpr CUDF_HOST_DEVICE void Set(IndexT index, StateIndexT value) noexcept + __host__ __device__ __forceinline__ void Set(IndexT index, StateIndexT value) noexcept { state_[index] = value; } template - constexpr CUDF_HOST_DEVICE StateIndexT Get(IndexT index) const noexcept + __host__ __device__ __forceinline__ StateIndexT Get(IndexT index) const noexcept { return state_[index]; } @@ -71,7 +69,7 @@ class MultiItemStateVector { template struct VectorCompositeOp { template - constexpr CUDF_HOST_DEVICE VectorT operator()(VectorT const& lhs, VectorT const& rhs) + __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) { VectorT res; for (int32_t i = 0; i < NUM_ITEMS; ++i) { @@ -109,8 +107,7 @@ class DFASimulationCallbackWrapper { uint32_t count = transducer_table(old_state.Get(0), symbol_id); if (write) { for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = - transducer_table(old_state.Get(0), symbol_id, out_char); + out_it[out_count + out_char] = transducer_table(old_state.Get(0), symbol_id, out_char); out_idx_it[out_count + out_char] = offset + character_index; } } @@ -188,8 +185,8 @@ struct StateTransitionOp { __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, const SymbolIndexT& read_symbol_id) { - using TransitionVectorT= typename TransitionTableT::TransitionVectorT ; - 
old_state_vector = state_vector; + using TransitionVectorT = typename TransitionTableT::TransitionVectorT; + old_state_vector = state_vector; state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); } @@ -344,7 +341,8 @@ struct AgentDFA { { AliasedLoadT thread_units[UINTS_PER_THREAD]; - const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + const AliasedLoadT* d_block_symbols = + reinterpret_cast(d_chars + block_offset); cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_units); #pragma unroll @@ -370,7 +368,8 @@ struct AgentDFA { OffsetT num_total_units = CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT)); - const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + const AliasedLoadT* d_block_symbols = + reinterpret_cast(d_chars + block_offset); cub::LoadDirectStriped( threadIdx.x, d_block_symbols, thread_units, num_total_units); @@ -419,7 +418,8 @@ struct AgentDFA { const OffsetT num_total_symbols, StateVectorT& state_vector) { - using StateVectorTransitionOpT = StateVectorTransitionOp; + using StateVectorTransitionOpT = + StateVectorTransitionOp; // Start parsing and to transition states StateVectorTransitionOpT transition_op(transition_table, state_vector); @@ -650,7 +650,9 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ } // Perform finite-state machine simulation, computing size of transduced output - DFASimulationCallbackWrapper + DFASimulationCallbackWrapper callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); MultiItemStateVector t_start_state; diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index fc14faaf10a..316d6ea0d5f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -119,7 +119,7 @@ struct DispatchFSM : DeviceFSMPolicy { TransducedIndexOutItT 
transduced_out_idx_it; TransducedCountOutItT d_num_transduced_out_it; cudaStream_t stream; - int ptx_version; + int const ptx_version; //------------------------------------------------------------------------------ // CONSTRUCTOR @@ -422,7 +422,7 @@ struct DispatchFSM : DeviceFSMPolicy { // COMPUTE STATE-TRANSITION VECTORS //------------------------------------------------------------------------------ ScanTileStateT stv_tile_state; - if constexpr(SINGLE_PASS_STV) { + if constexpr (SINGLE_PASS_STV) { // Construct the tile status (aliases memory internally et al.) error = stv_tile_state.Init( num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]); diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index f9619c82fe8..ed5948249d4 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -35,56 +35,55 @@ namespace detail { * @tparam BackingFragmentT The data type that is holding the fragments */ template -struct MultiFragmentInRegArray { - /// [b] Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] +class MultiFragmentInRegArray { + private: + /// Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] static constexpr uint32_t MIN_BITS_PER_ITEM = (MAX_ITEM_VALUE == 0) ? 
1 : cub::Log2<(MAX_ITEM_VALUE + 1)>::VALUE; /// Number of bits that each fragment can store static constexpr uint32_t NUM_BITS_PER_FRAGMENT = sizeof(BackingFragmentT) * 8; - /// [a] The number of bits per fragment per item in the array + /// The number of bits per fragment per item in the array static constexpr uint32_t AVAIL_BITS_PER_FRAG_ITEM = NUM_BITS_PER_FRAGMENT / NUM_ITEMS; - /// [k] The number of bits per item per fragment to be a power of two to avoid costly integer + /// The number of bits per item per fragment to be a power of two to avoid costly integer /// multiplication - /// TODO: specialise for VOLTA and later architectures that have efficient integer multiplication static constexpr uint32_t BITS_PER_FRAG_ITEM = 0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1); - static constexpr uint32_t LOG2_BITS_PER_FRAG_ITEM = cub::Log2::VALUE; - // [f] Number of fragments required to store and to reconstruct an item + // Number of fragments required to store and to reconstruct each item static constexpr uint32_t FRAGMENTS_PER_ITEM = (MIN_BITS_PER_ITEM + BITS_PER_FRAG_ITEM - 1) / BITS_PER_FRAG_ITEM; //------------------------------------------------------------------------------ - // MEMBER VARIABLES + // HELPER FUNCTIONS //------------------------------------------------------------------------------ - __device__ __host__ __forceinline__ unsigned int bfe(const unsigned int& data, - unsigned int bit_start, - unsigned int num_bits) const + __device__ __host__ __forceinline__ uint32_t bfe(const uint32_t& data, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 return cub::BFE(data, bit_start, num_bits); #else - const unsigned int MASK = (1 << num_bits) - 1; + const uint32_t MASK = (1 << num_bits) - 1; return (data >> bit_start) & MASK; #endif } - __device__ __host__ __forceinline__ void bfi(unsigned int& data, - unsigned int bits, - unsigned int bit_start, - unsigned int num_bits) const + __device__ __host__ __forceinline__ void 
bfi(uint32_t& data, + uint32_t bits, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 cub::BFI(data, data, bits, bit_start, num_bits); #else - unsigned int x = bits << bit_start; - unsigned int y = data; - unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; - unsigned int MASK_Y = ~MASK_X; - data = (y & MASK_Y) | (x & MASK_X); + uint32_t x = bits << bit_start; + uint32_t y = data; + uint32_t MASK_X = ((1 << num_bits) - 1) << bit_start; + uint32_t MASK_Y = ~MASK_X; + data = (y & MASK_Y) | (x & MASK_X); #endif } @@ -93,6 +92,7 @@ struct MultiFragmentInRegArray { //------------------------------------------------------------------------------ // ACCESSORS //------------------------------------------------------------------------------ + public: __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const { uint32_t val = 0; From 39a6b65c9fc4ad12d33155b54c8373b98de2de43 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 25 Apr 2022 09:59:37 -0700 Subject: [PATCH 03/38] refactored lookup tables --- cpp/src/io/fst/agent_dfa.cuh | 3 - cpp/src/io/fst/device_dfa.cuh | 192 +++++++++++++-------------- cpp/src/io/fst/symbol_lut.cuh | 94 +++++-------- cpp/src/io/fst/transition_table.cuh | 109 +++++---------- cpp/src/io/fst/translation_table.cuh | 123 +++++++---------- cpp/tests/io/fst/fst_test.cu | 5 +- 6 files changed, 209 insertions(+), 317 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 0611973f78c..3bc59160696 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -152,8 +152,6 @@ class StateVectorTransitionOp : public StateTransitionCallbackOp { __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, SymbolIndexT const read_symbol_id) const { - using TransitionVectorT = typename TransitionTableT::TransitionVectorT; - for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector.Set(i, 
transition_table(state_vector.Get(i), read_symbol_id)); } @@ -185,7 +183,6 @@ struct StateTransitionOp { __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, const SymbolIndexT& read_symbol_id) { - using TransitionVectorT = typename TransitionTableT::TransitionVectorT; old_state_vector = state_vector; state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh index 795c4c98bec..b12283a9673 100644 --- a/cpp/src/io/fst/device_dfa.cuh +++ b/cpp/src/io/fst/device_dfa.cuh @@ -15,8 +15,9 @@ */ #pragma once -#include "cub/util_type.cuh" #include "dispatch_dfa.cuh" + +#include #include #include #include @@ -95,140 +96,121 @@ cudaError_t DeviceTransduce(void* d_temp_storage, stream); } -/** - * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the - * transition table and its number of states, the mapping of symbols to symbol groups, and the - * translation table that specifies which state transitions cause which output to be written). 
- * - * @tparam OutSymbolT The symbol type being output by the finite-state transducer - * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of - * the transition table) - * @tparam TT_NUM_STATES The number of states defined by the DFA (the other dimension of the - * transition table) - */ -template -class Dfa { +template +class dfa_device_view { + private: + using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; + using transition_table_init_t = typename TransitionTableT::KernelParameter; + using translation_table_init_t = typename TranslationTableT::KernelParameter; + public: // The maximum number of states supported by this DFA instance // This is a value queried by the DFA simulation algorithm - static constexpr int32_t MAX_NUM_STATES = TT_NUM_STATES; + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - private: - // Symbol-group id lookup table - using MatcherT = detail::SingleSymbolSmemLUT; - using MatcherInitT = typename MatcherT::KernelParameter; - - // Transition table - using TransitionTableT = detail::TransitionTable; - using TransitionTableInitT = typename TransitionTableT::KernelParameter; - - // Translation lookup table - using OutSymbolOffsetT = uint32_t; - using TransducerTableT = detail::TransducerLookupTable; - using TransducerTableInitT = typename TransducerTableT::KernelParameter; - - // Private members (passed between host/device) - /// Information to initialize the device-side lookup table that maps symbol -> symbol group id - MatcherInitT symbol_matcher_init; - - /// Information to initialize the device-side transition table - TransitionTableInitT tt_init; - - /// Information to initialize the device-side translation table - TransducerTableInitT tt_out_init; - - public: //--------------------------------------------------------------------- // DEVICE-SIDE MEMBER FUNCTIONS //--------------------------------------------------------------------- - using SymbolGroupStorageT = 
typename MatcherT::TempStorage; + using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; using TransitionTableStorageT = typename TransitionTableT::TempStorage; - using TranslationTableStorageT = typename TransducerTableT::TempStorage; + using TranslationTableStorageT = typename TranslationTableT::TempStorage; __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) { - return MatcherT(symbol_matcher_init, temp_storage); + return SymbolGroupIdLookupT(*d_sgid_lut_init, temp_storage); } __device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) { - return TransitionTableT(tt_init, temp_storage); + return TransitionTableT(*d_transition_table_init, temp_storage); } __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) { - return TransducerTableT(tt_out_init, temp_storage); + return TranslationTableT(*d_translation_table_init, temp_storage); } - //--------------------------------------------------------------------- - // HOST-SIDE MEMBER FUNCTIONS - //--------------------------------------------------------------------- - template - cudaError_t Init(SymbolGroupIdItT const& symbol_vec, - std::vector> const& tt_vec, - std::vector>> const& out_tt_vec, - cudaStream_t stream = 0) + dfa_device_view(sgid_lut_init_t const* d_sgid_lut_init, + transition_table_init_t const* d_transition_table_init, + translation_table_init_t const* d_translation_table_init) + : d_sgid_lut_init(d_sgid_lut_init), + d_transition_table_init(d_transition_table_init), + d_translation_table_init(d_translation_table_init) { - cudaError_t error = cudaSuccess; - - enum : uint32_t { MEM_SYMBOL_MATCHER = 0, MEM_TT, MEM_OUT_TT, NUM_ALLOCATIONS }; + } - size_t allocation_sizes[NUM_ALLOCATIONS] = {0}; - void* allocations[NUM_ALLOCATIONS] = {0}; + private: + sgid_lut_init_t const* d_sgid_lut_init; + transition_table_init_t const* d_transition_table_init; + translation_table_init_t const* d_translation_table_init; +}; - // Memory requirements: 
lookup table - error = MatcherT::PrepareLUT( - nullptr, allocation_sizes[MEM_SYMBOL_MATCHER], symbol_vec, symbol_matcher_init); - if (error) return error; +/** + * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the + * transition table and its number of states, the mapping of symbols to symbol groups, and the + * translation table that specifies which state transitions cause which output to be written). + * + * @tparam OutSymbolT The symbol type being output by the finite-state transducer + * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of + * the transition table) + * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the + * transition table) + */ +template +class Dfa { + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - // Memory requirements: transition table - error = - TransitionTableT::CreateTransitionTable(nullptr, allocation_sizes[MEM_TT], tt_vec, tt_init); - if (error) return error; + private: + // Symbol-group id lookup table + using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; + using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; - // Memory requirements: transducer table - error = TransducerTableT::CreateTransitionTable( - nullptr, allocation_sizes[MEM_OUT_TT], out_tt_vec, tt_out_init); - if (error) return error; + // Transition table + using TransitionTableT = detail::TransitionTable; + using TransitionTableInitT = typename TransitionTableT::KernelParameter; - // Memory requirements: total memory - size_t temp_storage_bytes = 0; - error = cub::AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes); - if (error) return error; + // Translation lookup table + using OutSymbolOffsetT = uint32_t; + using TranslationTableT = 
detail::TransducerLookupTable; + using TranslationTableInitT = typename TranslationTableT::KernelParameter; + + auto get_device_view() + { + return dfa_device_view{ + sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; + } - // Allocate memory - void* d_temp_storage = nullptr; - error = cudaMalloc(&d_temp_storage, temp_storage_bytes); - if (error) return error; + public: + template + Dfa(SymbolGroupIdItT const& symbol_vec, + std::vector> const& tt_vec, + std::vector>> const& out_tt_vec, + cudaStream_t stream) + { + constexpr std::size_t single_item = 1; - // Alias memory - error = - cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); - if (error) return error; + sgid_init = hostdevice_vector{single_item, stream}; + transition_table_init = hostdevice_vector{single_item, stream}; + translation_table_init = hostdevice_vector{single_item, stream}; - // Initialize symbol group lookup table - error = MatcherT::PrepareLUT(allocations[MEM_SYMBOL_MATCHER], - allocation_sizes[MEM_SYMBOL_MATCHER], - symbol_vec, - symbol_matcher_init, - stream); - if (error) return error; + // Initialize symbol group id lookup table + SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); // Initialize state transition table - error = TransitionTableT::CreateTransitionTable( - allocations[MEM_TT], allocation_sizes[MEM_TT], tt_vec, tt_init, stream); - if (error) return error; + TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); // Initialize finite-state transducer lookup table - error = TransducerTableT::CreateTransitionTable( - allocations[MEM_OUT_TT], allocation_sizes[MEM_OUT_TT], out_tt_vec, tt_out_init, stream); - if (error) return error; - - return error; + TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); } template get_device_view(), d_chars, num_chars, d_out_it, @@ -257,8 +239,12 @@ class Dfa { seed_state, 
stream); } -}; + private: + hostdevice_vector sgid_init{}; + hostdevice_vector transition_table_init{}; + hostdevice_vector translation_table_init{}; +}; } // namespace fst } // namespace io } // namespace cudf diff --git a/cpp/src/io/fst/symbol_lut.cuh b/cpp/src/io/fst/symbol_lut.cuh index 08d5f4db58d..abf71a7fbea 100644 --- a/cpp/src/io/fst/symbol_lut.cuh +++ b/cpp/src/io/fst/symbol_lut.cuh @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include #include @@ -34,38 +37,29 @@ namespace detail { * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id */ template -struct SingleSymbolSmemLUT { - //------------------------------------------------------------------------------ - // DEFAULT TYPEDEFS - //------------------------------------------------------------------------------ +class SingleSymbolSmemLUT { + private: // Type used for representing a symbol group id (i.e., what we return for a given symbol) using SymbolGroupIdT = uint8_t; - //------------------------------------------------------------------------------ - // DERIVED CONFIGURATIONS - //------------------------------------------------------------------------------ /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); - //------------------------------------------------------------------------------ - // TYPEDEFS - //------------------------------------------------------------------------------ - struct _TempStorage { - // d_match_meta_data[symbol] -> symbol group index - SymbolGroupIdT match_meta_data[NUM_ENTRIES_PER_LUT]; + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; }; + public: struct KernelParameter { - // d_match_meta_data[min(symbol,num_valid_entries)] -> symbol group index - SymbolGroupIdT num_valid_entries; + // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index + SymbolT 
num_valid_entries; - // d_match_meta_data[symbol] -> symbol group index - SymbolGroupIdT* d_match_meta_data; + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; }; - struct TempStorage : cub::Uninitialized<_TempStorage> { - }; + using TempStorage = cub::Uninitialized<_TempStorage>; //------------------------------------------------------------------------------ // HELPER METHODS @@ -73,66 +67,48 @@ struct SingleSymbolSmemLUT { /** * @brief * - * @param[in] d_temp_storage Device-side temporary storage that can be used to store the lookup - * table. If no storage is provided it will return the temporary storage requirements in \p - * d_temp_storage_bytes. - * @param[in,out] d_temp_storage_bytes Amount of device-side temporary storage that can be used in - * the number of bytes + * @param[out] sgid_init A hostdevice_vector that will be populated * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols * (characters!) that correspond to the i-th symbol group index - * @param[out] kernel_param The kernel parameter object to be initialized with the given mapping - * of symbols to symbol group ids. 
* @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table * @return */ template - __host__ __forceinline__ static cudaError_t PrepareLUT(void* d_temp_storage, - size_t& d_temp_storage_bytes, - SymbolGroupItT const& symbol_strings, - KernelParameter& kernel_param, - cudaStream_t stream = 0) + static void InitDeviceSymbolGroupIdLut(hostdevice_vector& sgid_init, + SymbolGroupItT const& symbol_strings, + rmm::cuda_stream_view stream) { // The symbol group index to be returned if none of the given symbols match SymbolGroupIdT no_match_id = symbol_strings.size(); - std::vector lut(NUM_ENTRIES_PER_LUT); + // The symbol with the largest value that is mapped to a symbol group id SymbolGroupIdT max_base_match_val = 0; // Initialize all entries: by default we return the no-match-id - for (uint32_t i = 0; i < NUM_ENTRIES_PER_LUT; ++i) { - lut[i] = no_match_id; - } + std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], + &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], + no_match_id); // Set up lookup table uint32_t sg_id = 0; + // Iterate over the symbol groups for (auto const& sg_symbols : symbol_strings) { + // Iterate over all symbols that belong to the current symbol group for (auto const& sg_symbol : sg_symbols) { max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); - lut[sg_symbol] = sg_id; + sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; } sg_id++; } - // Initialize the out-of-bounds lookup: d_match_meta_data[max_base_match_val+1] -> no_match_id - lut[max_base_match_val + 1] = no_match_id; + // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id + sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; // Alias memory / return memory requiremenets - kernel_param.num_valid_entries = max_base_match_val + 2; - if (d_temp_storage) { - cudaError_t error = cudaMemcpyAsync(d_temp_storage, - lut.data(), - kernel_param.num_valid_entries * 
sizeof(SymbolGroupIdT), - cudaMemcpyHostToDevice, - stream); - - kernel_param.d_match_meta_data = reinterpret_cast(d_temp_storage); - return error; - } else { - d_temp_storage_bytes = kernel_param.num_valid_entries * sizeof(SymbolGroupIdT); - return cudaSuccess; - } + // TODO I think this could be +1? + sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 2; - return cudaSuccess; + sgid_init.host_to_device(stream); } //------------------------------------------------------------------------------ @@ -150,29 +126,29 @@ struct SingleSymbolSmemLUT { return private_storage; } - __host__ __device__ __forceinline__ SingleSymbolSmemLUT(KernelParameter const& kernel_param, - TempStorage& temp_storage) + constexpr CUDF_HOST_DEVICE SingleSymbolSmemLUT(KernelParameter const& kernel_param, + TempStorage& temp_storage) : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) { // GPU-side init #if CUB_PTX_ARCH > 0 for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) { - this->temp_storage.match_meta_data[i] = kernel_param.d_match_meta_data[i]; + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; } __syncthreads(); #else // CPU-side init for (std::size_t i = 0; i < kernel_param.num_luts; i++) { - this->temp_storage.match_meta_data[i] = kernel_param.d_match_meta_data[i]; + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; } #endif } - __host__ __device__ __forceinline__ int32_t operator()(SymbolT const symbol) const + constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const { // Look up the symbol group for given symbol - return temp_storage.match_meta_data[min(symbol, num_valid_entries - 1)]; + return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)]; } }; diff --git a/cpp/src/io/fst/transition_table.cuh b/cpp/src/io/fst/transition_table.cuh index 97fef03d8af..5eccb926974 100644 --- a/cpp/src/io/fst/transition_table.cuh +++ 
b/cpp/src/io/fst/transition_table.cuh @@ -16,6 +16,10 @@ #pragma once +#include +#include +#include + #include #include @@ -25,103 +29,50 @@ namespace io { namespace fst { namespace detail { -template -struct TransitionTable { - //------------------------------------------------------------------------------ - // DEFAULT TYPEDEFS - //------------------------------------------------------------------------------ +template +class TransitionTable { + private: + // Type used using ItemT = char; - struct TransitionVectorWrapper { - const ItemT* data; - - __host__ __device__ TransitionVectorWrapper(const ItemT* data) : data(data) {} - - __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const { return data[index]; } - }; - - //------------------------------------------------------------------------------ - // TYPEDEFS - //------------------------------------------------------------------------------ - using TransitionVectorT = TransitionVectorWrapper; - struct _TempStorage { - // ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; }; - struct TempStorage : cub::Uninitialized<_TempStorage> { - }; + public: + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - ItemT* transitions; + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; }; - using LoadAliasT = std::uint32_t; - - static constexpr std::size_t NUM_AUX_MEM_BYTES = - CUB_QUOTIENT_CEILING(MAX_NUM_STATES * MAX_NUM_SYMBOLS * sizeof(ItemT), sizeof(LoadAliasT)) * - sizeof(LoadAliasT); - - //------------------------------------------------------------------------------ - // HELPER METHODS - //------------------------------------------------------------------------------ - __host__ static cudaError_t CreateTransitionTable( - void* d_temp_storage, - size_t& temp_storage_bytes, - const std::vector>& trans_table, - KernelParameter& kernel_param, - cudaStream_t stream = 0) + static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, + const std::vector>& 
trans_table, + rmm::cuda_stream_view stream) { - if (!d_temp_storage) { - temp_storage_bytes = NUM_AUX_MEM_BYTES; - return cudaSuccess; - } - - // trans_vectors[symbol][state] -> new_state - ItemT trans_vectors[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; - // trans_table[state][symbol] -> new state for (std::size_t state = 0; state < trans_table.size(); ++state) { for (std::size_t symbol = 0; symbol < trans_table[state].size(); ++symbol) { - trans_vectors[symbol * MAX_NUM_STATES + state] = trans_table[state][symbol]; + transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = + trans_table[state][symbol]; } } - kernel_param.transitions = static_cast(d_temp_storage); - // Copy transition table to device - return cudaMemcpyAsync( - d_temp_storage, trans_vectors, NUM_AUX_MEM_BYTES, cudaMemcpyHostToDevice, stream); + transition_table_init.host_to_device(stream); } - //------------------------------------------------------------------------------ - // MEMBER VARIABLES - //------------------------------------------------------------------------------ - _TempStorage& temp_storage; - - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - //------------------------------------------------------------------------------ - // CONSTRUCTOR - //------------------------------------------------------------------------------ - __host__ __device__ __forceinline__ TransitionTable(const KernelParameter& kernel_param, - TempStorage& temp_storage) + constexpr CUDF_HOST_DEVICE TransitionTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) { #if CUB_PTX_ARCH > 0 - for (int i = threadIdx.x; i < CUB_QUOTIENT_CEILING(NUM_AUX_MEM_BYTES, sizeof(LoadAliasT)); - i += blockDim.x) { - reinterpret_cast(this->temp_storage.transitions)[i] = - reinterpret_cast(kernel_param.transitions)[i]; + for (int i = threadIdx.x; i < MAX_NUM_STATES * 
MAX_NUM_SYMBOLS; i += blockDim.x) { + this->temp_storage.transitions[i] = kernel_param.transitions[i]; } __syncthreads(); #else - for (int i = 0; i < kernel_param.num_luts; i++) { + for (int i = 0; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i++) { this->temp_storage.transitions[i] = kernel_param.transitions[i]; } #endif @@ -136,11 +87,21 @@ struct TransitionTable { * @return */ template - __host__ __device__ __forceinline__ int32_t operator()(StateIndexT state_id, - SymbolIndexT match_id) const + constexpr CUDF_HOST_DEVICE int32_t operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const { return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id]; - } + } + + private: + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + + return private_storage; + } }; } // namespace detail diff --git a/cpp/src/io/fst/translation_table.cuh b/cpp/src/io/fst/translation_table.cuh index bfbfd41e3f0..89da994606c 100644 --- a/cpp/src/io/fst/translation_table.cuh +++ b/cpp/src/io/fst/translation_table.cuh @@ -16,7 +16,12 @@ #pragma once -#include "in_reg_array.cuh" +#include +#include +#include +#include + +#include "rmm/device_uvector.hpp" #include @@ -28,10 +33,10 @@ namespace fst { namespace detail { /** - * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols to - * output + * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols + * that the finite-state transducer is supposed to output for each transition * - * @tparam OutSymbolT The symbol type being returned + * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall 
support @@ -42,57 +47,35 @@ template -struct TransducerLookupTable { - //------------------------------------------------------------------------------ - // TYPEDEFS - //------------------------------------------------------------------------------ +class TransducerLookupTable { + private: struct _TempStorage { OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; OutSymbolT out_symbols[MAX_TABLE_SIZE]; }; - struct TempStorage : cub::Uninitialized<_TempStorage> { - }; + public: + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - OutSymbolOffsetT* d_trans_offsets; - OutSymbolT* d_out_symbols; + OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; }; - //------------------------------------------------------------------------------ - // HELPER METHODS - //------------------------------------------------------------------------------ - __host__ static cudaError_t CreateTransitionTable( - void* d_temp_storage, - size_t& temp_storage_bytes, - const std::vector>>& trans_table, - KernelParameter& kernel_param, - cudaStream_t stream = 0) + /** + * @brief Initializes the translation table (both the host and device parts) + */ + static void InitDeviceTranslationTable( + hostdevice_vector& translation_table_init, + std::vector>> const& trans_table, + rmm::cuda_stream_view stream) { - enum { MEM_OFFSETS = 0, MEM_OUT_SYMBOLS, NUM_ALLOCATIONS }; - - size_t allocation_sizes[NUM_ALLOCATIONS] = {}; - void* allocations[NUM_ALLOCATIONS] = {}; - allocation_sizes[MEM_OFFSETS] = - (MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1) * sizeof(OutSymbolOffsetT); - allocation_sizes[MEM_OUT_SYMBOLS] = MAX_TABLE_SIZE * sizeof(OutSymbolT); - - // Alias the temporary allocations from the single storage blob (or compute the necessary size - // of the blob) - cudaError_t error = - cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); - if (error) return error; - - // 
Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == nullptr) return cudaSuccess; - std::vector out_symbols; out_symbols.reserve(MAX_TABLE_SIZE); std::vector out_symbol_offsets; out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); out_symbol_offsets.push_back(0); - int st = 0; // Iterate over the states in the transition table for (auto const& state_trans : trans_table) { uint32_t num_added = 0; @@ -103,7 +86,6 @@ struct TransducerLookupTable { out_symbol_offsets.push_back(out_symbols.size()); num_added++; } - st++; // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of // this state) @@ -115,30 +97,21 @@ struct TransducerLookupTable { } // Check whether runtime-provided table size exceeds the compile-time given max. table size - if (out_symbols.size() > MAX_TABLE_SIZE) { return cudaErrorInvalidValue; } - - kernel_param.d_trans_offsets = static_cast(allocations[MEM_OFFSETS]); - kernel_param.d_out_symbols = static_cast(allocations[MEM_OUT_SYMBOLS]); - - // Copy out symbols - error = cudaMemcpyAsync(kernel_param.d_trans_offsets, - out_symbol_offsets.data(), - out_symbol_offsets.size() * sizeof(out_symbol_offsets[0]), - cudaMemcpyHostToDevice, - stream); - if (error) { return error; } - - // Copy offsets into output symbols - return cudaMemcpyAsync(kernel_param.d_out_symbols, - out_symbols.data(), - out_symbols.size() * sizeof(out_symbols[0]), - cudaMemcpyHostToDevice, - stream); + if (out_symbols.size() > MAX_TABLE_SIZE) { CUDF_FAIL("Unsupported translation table"); } + + // Prepare host-side data to be copied and passed to the device + std::copy(std::cbegin(out_symbol_offsets), + std::cend(out_symbol_offsets), + translation_table_init.host_ptr()->d_out_offsets); + std::copy(std::cbegin(out_symbols), + std::cend(out_symbols), + translation_table_init.host_ptr()->d_out_symbols); + + // Copy data to device + translation_table_init.host_to_device(stream); } - 
//------------------------------------------------------------------------------ - // MEMBER VARIABLES - //------------------------------------------------------------------------------ + private: _TempStorage& temp_storage; __device__ __forceinline__ _TempStorage& PrivateStorage() @@ -147,17 +120,19 @@ struct TransducerLookupTable { return private_storage; } - //------------------------------------------------------------------------------ - // CONSTRUCTOR - //------------------------------------------------------------------------------ - __host__ __device__ __forceinline__ TransducerLookupTable(const KernelParameter& kernel_param, - TempStorage& temp_storage) + public: + /** + * @brief Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + CUDF_HOST_DEVICE TransducerLookupTable(KernelParameter const& kernel_param, + TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) { constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; #if CUB_PTX_ARCH > 0 for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { - this->temp_storage.out_offset[i] = kernel_param.d_trans_offsets[i]; + this->temp_storage.out_offset[i] = kernel_param.d_out_offsets[i]; } // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared // memory @@ -168,7 +143,7 @@ struct TransducerLookupTable { __syncthreads(); #else for (int i = 0; i < num_offsets; i++) { - this->temp_storage.out_symbol_offsets[i] = kernel_param.d_trans_offsets[i]; + this->temp_storage.out_symbol_offsets[i] = kernel_param.d_out_offsets[i]; } for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; @@ -177,17 +152,17 @@ struct TransducerLookupTable { } template - __host__ __device__ __forceinline__ OutSymbolT operator()(StateIndexT state_id, - SymbolIndexT match_id, - RelativeOffsetT relative_offset) 
const + constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset) const { auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; return temp_storage.out_symbols[offset]; } template - __host__ __device__ __forceinline__ OutSymbolOffsetT operator()(StateIndexT state_id, - SymbolIndexT match_id) const + constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const { return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 26bb9d47dca..29c93a6f3bb 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -224,10 +224,7 @@ TEST_F(FstTest, GroundTruth) d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); // Run algorithm - DfaFstT parser; - - // Initialize DFA - ASSERT_CUDA_SUCCEEDED(parser.Init(pda_sgs, pda_state_tt, pda_out_tt, stream)); + DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream}; std::size_t temp_storage_bytes = 0; From 239f138d78cc12af8607f1feb7d7ec4bec2f58fc Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 25 Apr 2022 12:17:08 -0700 Subject: [PATCH 04/38] put lookup tables into their own cudf file --- cpp/src/io/fst/device_dfa.cuh | 152 -------- cpp/src/io/fst/lookup_tables.cuh | 519 +++++++++++++++++++++++++++ cpp/src/io/fst/symbol_lut.cuh | 158 -------- cpp/src/io/fst/transition_table.cuh | 110 ------ cpp/src/io/fst/translation_table.cuh | 175 --------- cpp/tests/io/fst/fst_test.cu | 4 +- 6 files changed, 521 insertions(+), 597 deletions(-) create mode 100644 cpp/src/io/fst/lookup_tables.cuh delete mode 100644 cpp/src/io/fst/symbol_lut.cuh delete mode 100644 
cpp/src/io/fst/transition_table.cuh delete mode 100644 cpp/src/io/fst/translation_table.cuh diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh index b12283a9673..d3f0e8be213 100644 --- a/cpp/src/io/fst/device_dfa.cuh +++ b/cpp/src/io/fst/device_dfa.cuh @@ -18,9 +18,6 @@ #include "dispatch_dfa.cuh" #include -#include -#include -#include #include @@ -96,155 +93,6 @@ cudaError_t DeviceTransduce(void* d_temp_storage, stream); } -template -class dfa_device_view { - private: - using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; - using transition_table_init_t = typename TransitionTableT::KernelParameter; - using translation_table_init_t = typename TranslationTableT::KernelParameter; - - public: - // The maximum number of states supported by this DFA instance - // This is a value queried by the DFA simulation algorithm - static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - - //--------------------------------------------------------------------- - // DEVICE-SIDE MEMBER FUNCTIONS - //--------------------------------------------------------------------- - using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; - using TransitionTableStorageT = typename TransitionTableT::TempStorage; - using TranslationTableStorageT = typename TranslationTableT::TempStorage; - - __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) - { - return SymbolGroupIdLookupT(*d_sgid_lut_init, temp_storage); - } - - __device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) - { - return TransitionTableT(*d_transition_table_init, temp_storage); - } - - __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) - { - return TranslationTableT(*d_translation_table_init, temp_storage); - } - - dfa_device_view(sgid_lut_init_t const* d_sgid_lut_init, - transition_table_init_t const* d_transition_table_init, - translation_table_init_t const* d_translation_table_init) - : 
d_sgid_lut_init(d_sgid_lut_init), - d_transition_table_init(d_transition_table_init), - d_translation_table_init(d_translation_table_init) - { - } - - private: - sgid_lut_init_t const* d_sgid_lut_init; - transition_table_init_t const* d_transition_table_init; - translation_table_init_t const* d_translation_table_init; -}; - -/** - * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the - * transition table and its number of states, the mapping of symbols to symbol groups, and the - * translation table that specifies which state transitions cause which output to be written). - * - * @tparam OutSymbolT The symbol type being output by the finite-state transducer - * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of - * the transition table) - * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the - * transition table) - */ -template -class Dfa { - public: - // The maximum number of states supported by this DFA instance - // This is a value queried by the DFA simulation algorithm - static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - - private: - // Symbol-group id lookup table - using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; - using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; - - // Transition table - using TransitionTableT = detail::TransitionTable; - using TransitionTableInitT = typename TransitionTableT::KernelParameter; - - // Translation lookup table - using OutSymbolOffsetT = uint32_t; - using TranslationTableT = detail::TransducerLookupTable; - using TranslationTableInitT = typename TranslationTableT::KernelParameter; - - auto get_device_view() - { - return dfa_device_view{ - sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; - } - - public: - template - Dfa(SymbolGroupIdItT const& symbol_vec, - std::vector> const& tt_vec, - std::vector>> const& out_tt_vec, - 
cudaStream_t stream) - { - constexpr std::size_t single_item = 1; - - sgid_init = hostdevice_vector{single_item, stream}; - transition_table_init = hostdevice_vector{single_item, stream}; - translation_table_init = hostdevice_vector{single_item, stream}; - - // Initialize symbol group id lookup table - SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); - - // Initialize state transition table - TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); - - // Initialize finite-state transducer lookup table - TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); - } - - template - cudaError_t Transduce(void* d_temp_storage, - size_t& temp_storage_bytes, - SymbolT const* d_chars, - OffsetT num_chars, - TransducedOutItT d_out_it, - TransducedIndexOutItT d_out_idx_it, - TransducedCountOutItT d_num_transduced_out_it, - const uint32_t seed_state = 0, - cudaStream_t stream = 0) - { - return DeviceTransduce(d_temp_storage, - temp_storage_bytes, - this->get_device_view(), - d_chars, - num_chars, - d_out_it, - d_out_idx_it, - d_num_transduced_out_it, - seed_state, - stream); - } - - private: - hostdevice_vector sgid_init{}; - hostdevice_vector transition_table_init{}; - hostdevice_vector translation_table_init{}; -}; } // namespace fst } // namespace io } // namespace cudf diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh new file mode 100644 index 00000000000..58853919b69 --- /dev/null +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +/** + * @brief Class template that can be plugged into the finite-state machine to look up the symbol + * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no + * look-ahead). + * + * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id + */ +template +class SingleSymbolSmemLUT { + private: + // Type used for representing a symbol group id (i.e., what we return for a given symbol) + using SymbolGroupIdT = uint8_t; + + /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) + static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); + + struct _TempStorage { + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; + }; + + public: + struct KernelParameter { + // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index + SymbolT num_valid_entries; + + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; + }; + + using TempStorage = cub::Uninitialized<_TempStorage>; + + //------------------------------------------------------------------------------ + // HELPER METHODS + //------------------------------------------------------------------------------ + /** + * @brief + * + * @param[out] sgid_init A hostdevice_vector that will be populated + * @param[in] symbol_strings 
Array of strings, where the i-th string holds all symbols + * (characters!) that correspond to the i-th symbol group index + * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table + * @return + */ + template + static void InitDeviceSymbolGroupIdLut(hostdevice_vector& sgid_init, + SymbolGroupItT const& symbol_strings, + rmm::cuda_stream_view stream) + { + // The symbol group index to be returned if none of the given symbols match + SymbolGroupIdT no_match_id = symbol_strings.size(); + + // The symbol with the largest value that is mapped to a symbol group id + SymbolGroupIdT max_base_match_val = 0; + + // Initialize all entries: by default we return the no-match-id + std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], + &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], + no_match_id); + + // Set up lookup table + uint32_t sg_id = 0; + // Iterate over the symbol groups + for (auto const& sg_symbols : symbol_strings) { + // Iterate over all symbols that belong to the current symbol group + for (auto const& sg_symbol : sg_symbols) { + max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); + sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; + } + sg_id++; + } + + // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id + sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; + + // Alias memory / return memory requiremenets + // TODO I think this could be +1? 
+ sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 2; + + sgid_init.host_to_device(stream); + } + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + SymbolGroupIdT num_valid_entries; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + constexpr CUDF_HOST_DEVICE SingleSymbolSmemLUT(KernelParameter const& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) + { + // GPU-side init +#if CUB_PTX_ARCH > 0 + for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) { + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; + } + __syncthreads(); + +#else + // CPU-side init + for (std::size_t i = 0; i < kernel_param.num_luts; i++) { + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; + } +#endif + } + + constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const + { + // Look up the symbol group for given symbol + return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)]; + } +}; + +template +class TransitionTable { + private: + // Type used + using ItemT = char; + + struct _TempStorage { + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + }; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + }; + + static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, + const std::vector>& trans_table, + rmm::cuda_stream_view stream) + { + // 
trans_table[state][symbol] -> new state + for (std::size_t state = 0; state < trans_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < trans_table[state].size(); ++symbol) { + transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = + trans_table[state][symbol]; + } + } + + // Copy transition table to device + transition_table_init.host_to_device(stream); + } + + constexpr CUDF_HOST_DEVICE TransitionTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i += blockDim.x) { + this->temp_storage.transitions[i] = kernel_param.transitions[i]; + } + __syncthreads(); +#else + for (int i = 0; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i++) { + this->temp_storage.transitions[i] = kernel_param.transitions[i]; + } +#endif + } + + /** + * @brief Returns a random-access iterator to lookup all the state transitions for one specific + * symbol from an arbitrary old_state, i.e., it[old_state] -> new_state. 
+ * + * @param state_id The DFA's current state index from which we'll transition + * @param match_id The symbol group id of the symbol that we just read in + * @return + */ + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const + { + return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id]; + } + + private: + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + + return private_storage; + } +}; + +template +class dfa_device_view { + private: + using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; + using transition_table_init_t = typename TransitionTableT::KernelParameter; + using translation_table_init_t = typename TranslationTableT::KernelParameter; + + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + + using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; + using TransitionTableStorageT = typename TransitionTableT::TempStorage; + using TranslationTableStorageT = typename TranslationTableT::TempStorage; + + __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) + { + return SymbolGroupIdLookupT(*d_sgid_lut_init, temp_storage); + } + + __device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) + { + return TransitionTableT(*d_transition_table_init, temp_storage); + } + + __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) + { + return TranslationTableT(*d_translation_table_init, temp_storage); + } + + dfa_device_view(sgid_lut_init_t const* d_sgid_lut_init, + transition_table_init_t const* d_transition_table_init, + translation_table_init_t const* d_translation_table_init) + : d_sgid_lut_init(d_sgid_lut_init), + d_transition_table_init(d_transition_table_init), + 
d_translation_table_init(d_translation_table_init) + { + } + + private: + sgid_lut_init_t const* d_sgid_lut_init; + transition_table_init_t const* d_transition_table_init; + translation_table_init_t const* d_translation_table_init; +}; + +/** + * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols + * that the finite-state transducer is supposed to output for each transition + * + * @tparam OutSymbolT The symbol type being output + * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + */ +template +class TransducerLookupTable { + private: + struct _TempStorage { + OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT out_symbols[MAX_TABLE_SIZE]; + }; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; + }; + + /** + * @brief Initializes the translation table (both the host and device parts) + */ + static void InitDeviceTranslationTable( + hostdevice_vector& translation_table_init, + std::vector>> const& trans_table, + rmm::cuda_stream_view stream) + { + std::vector out_symbols; + out_symbols.reserve(MAX_TABLE_SIZE); + std::vector out_symbol_offsets; + out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); + out_symbol_offsets.push_back(0); + + // Iterate over the states in the transition table + for (auto const& state_trans : trans_table) { + uint32_t num_added = 0; + // Iterate over the symbols in the transition table + for (auto const& symbol_out : state_trans) { + // Insert the output symbols for 
this specific (state, symbol) transition + out_symbols.insert(std::end(out_symbols), std::begin(symbol_out), std::end(symbol_out)); + out_symbol_offsets.push_back(out_symbols.size()); + num_added++; + } + + // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of + // this state) + if (MAX_NUM_SYMBOLS > num_added) { + int32_t count = MAX_NUM_SYMBOLS - num_added; + auto begin_it = std::prev(std::end(out_symbol_offsets)); + std::copy(begin_it, begin_it + count, std::back_inserter(out_symbol_offsets)); + } + } + + // Check whether runtime-provided table size exceeds the compile-time given max. table size + if (out_symbols.size() > MAX_TABLE_SIZE) { CUDF_FAIL("Unsupported translation table"); } + + // Prepare host-side data to be copied and passed to the device + std::copy(std::cbegin(out_symbol_offsets), + std::cend(out_symbol_offsets), + translation_table_init.host_ptr()->d_out_offsets); + std::copy(std::cbegin(out_symbols), + std::cend(out_symbols), + translation_table_init.host_ptr()->d_out_symbols); + + // Copy data to device + translation_table_init.host_to_device(stream); + } + + private: + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + public: + /** + * @brief Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + CUDF_HOST_DEVICE TransducerLookupTable(KernelParameter const& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { + constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { + this->temp_storage.out_offset[i] = kernel_param.d_out_offsets[i]; + } + // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared + // memory + __syncthreads(); + for 
(int i = threadIdx.x; i < this->temp_storage.out_offset[num_offsets - 1]; i += blockDim.x) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } + __syncthreads(); +#else + for (int i = 0; i < num_offsets; i++) { + this->temp_storage.out_symbol_offsets[i] = kernel_param.d_out_offsets[i]; + } + for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } +#endif + } + + template + constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset) const + { + auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; + return temp_storage.out_symbols[offset]; + } + + template + constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const + { + return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - + temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; + } +}; + +/** + * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the + * transition table and its number of states, the mapping of symbols to symbol groups, and the + * translation table that specifies which state transitions cause which output to be written). 
+ * + * @tparam OutSymbolT The symbol type being output by the finite-state transducer + * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of + * the transition table) + * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the + * transition table) + */ +template +class Dfa { + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + + private: + // Symbol-group id lookup table + using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; + using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; + + // Transition table + using TransitionTableT = detail::TransitionTable; + using TransitionTableInitT = typename TransitionTableT::KernelParameter; + + // Translation lookup table + using OutSymbolOffsetT = uint32_t; + using TranslationTableT = detail::TransducerLookupTable; + using TranslationTableInitT = typename TranslationTableT::KernelParameter; + + auto get_device_view() + { + return dfa_device_view{ + sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; + } + + public: + template + Dfa(SymbolGroupIdItT const& symbol_vec, + std::vector> const& tt_vec, + std::vector>> const& out_tt_vec, + cudaStream_t stream) + { + constexpr std::size_t single_item = 1; + + sgid_init = hostdevice_vector{single_item, stream}; + transition_table_init = hostdevice_vector{single_item, stream}; + translation_table_init = hostdevice_vector{single_item, stream}; + + // Initialize symbol group id lookup table + SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); + + // Initialize state transition table + TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); + + // Initialize finite-state transducer lookup table + 
TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); + } + + template + cudaError_t Transduce(void* d_temp_storage, + size_t& temp_storage_bytes, + SymbolT const* d_chars, + OffsetT num_chars, + TransducedOutItT d_out_it, + TransducedIndexOutItT d_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + const uint32_t seed_state = 0, + cudaStream_t stream = 0) + { + return DeviceTransduce(d_temp_storage, + temp_storage_bytes, + this->get_device_view(), + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); + } + + private: + hostdevice_vector sgid_init{}; + hostdevice_vector transition_table_init{}; + hostdevice_vector translation_table_init{}; +}; + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/symbol_lut.cuh b/cpp/src/io/fst/symbol_lut.cuh deleted file mode 100644 index abf71a7fbea..00000000000 --- a/cpp/src/io/fst/symbol_lut.cuh +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace fst { -namespace detail { -/** - * @brief Class template that can be plugged into the finite-state machine to look up the symbol - * group index for a given symbol. 
Class template does not support multi-symbol lookups (i.e., no - * look-ahead). - * - * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id - */ -template -class SingleSymbolSmemLUT { - private: - // Type used for representing a symbol group id (i.e., what we return for a given symbol) - using SymbolGroupIdT = uint8_t; - - /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) - static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); - - struct _TempStorage { - // sym_to_sgid[symbol] -> symbol group index - SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; - }; - - public: - struct KernelParameter { - // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index - SymbolT num_valid_entries; - - // sym_to_sgid[symbol] -> symbol group index - SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; - }; - - using TempStorage = cub::Uninitialized<_TempStorage>; - - //------------------------------------------------------------------------------ - // HELPER METHODS - //------------------------------------------------------------------------------ - /** - * @brief - * - * @param[out] sgid_init A hostdevice_vector that will be populated - * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols - * (characters!) 
that correspond to the i-th symbol group index - * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table - * @return - */ - template - static void InitDeviceSymbolGroupIdLut(hostdevice_vector& sgid_init, - SymbolGroupItT const& symbol_strings, - rmm::cuda_stream_view stream) - { - // The symbol group index to be returned if none of the given symbols match - SymbolGroupIdT no_match_id = symbol_strings.size(); - - // The symbol with the largest value that is mapped to a symbol group id - SymbolGroupIdT max_base_match_val = 0; - - // Initialize all entries: by default we return the no-match-id - std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], - &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], - no_match_id); - - // Set up lookup table - uint32_t sg_id = 0; - // Iterate over the symbol groups - for (auto const& sg_symbols : symbol_strings) { - // Iterate over all symbols that belong to the current symbol group - for (auto const& sg_symbol : sg_symbols) { - max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); - sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; - } - sg_id++; - } - - // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id - sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; - - // Alias memory / return memory requiremenets - // TODO I think this could be +1? 
- sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 2; - - sgid_init.host_to_device(stream); - } - - //------------------------------------------------------------------------------ - // MEMBER VARIABLES - //------------------------------------------------------------------------------ - _TempStorage& temp_storage; - SymbolGroupIdT num_valid_entries; - - //------------------------------------------------------------------------------ - // CONSTRUCTOR - //------------------------------------------------------------------------------ - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - constexpr CUDF_HOST_DEVICE SingleSymbolSmemLUT(KernelParameter const& kernel_param, - TempStorage& temp_storage) - : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) - { - // GPU-side init -#if CUB_PTX_ARCH > 0 - for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) { - this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; - } - __syncthreads(); - -#else - // CPU-side init - for (std::size_t i = 0; i < kernel_param.num_luts; i++) { - this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; - } -#endif - } - - constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const - { - // Look up the symbol group for given symbol - return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)]; - } -}; - -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/fst/transition_table.cuh b/cpp/src/io/fst/transition_table.cuh deleted file mode 100644 index 5eccb926974..00000000000 --- a/cpp/src/io/fst/transition_table.cuh +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include - -#include - -namespace cudf { -namespace io { -namespace fst { -namespace detail { - -template -class TransitionTable { - private: - // Type used - using ItemT = char; - - struct _TempStorage { - ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; - }; - - public: - using TempStorage = cub::Uninitialized<_TempStorage>; - - struct KernelParameter { - ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; - }; - - static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, - const std::vector>& trans_table, - rmm::cuda_stream_view stream) - { - // trans_table[state][symbol] -> new state - for (std::size_t state = 0; state < trans_table.size(); ++state) { - for (std::size_t symbol = 0; symbol < trans_table[state].size(); ++symbol) { - transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = - trans_table[state][symbol]; - } - } - - // Copy transition table to device - transition_table_init.host_to_device(stream); - } - - constexpr CUDF_HOST_DEVICE TransitionTable(const KernelParameter& kernel_param, - TempStorage& temp_storage) - : temp_storage(temp_storage.Alias()) - { -#if CUB_PTX_ARCH > 0 - for (int i = threadIdx.x; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i += blockDim.x) { - this->temp_storage.transitions[i] = kernel_param.transitions[i]; - } - __syncthreads(); -#else - for (int i = 0; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i++) { - this->temp_storage.transitions[i] = kernel_param.transitions[i]; - } -#endif - } - - /** - * @brief Returns a 
random-access iterator to lookup all the state transitions for one specific - * symbol from an arbitrary old_state, i.e., it[old_state] -> new_state. - * - * @param state_id The DFA's current state index from which we'll transition - * @param match_id The symbol group id of the symbol that we just read in - * @return - */ - template - constexpr CUDF_HOST_DEVICE int32_t operator()(StateIndexT const state_id, - SymbolIndexT const match_id) const - { - return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id]; - } - - private: - _TempStorage& temp_storage; - - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - - return private_storage; - } -}; - -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/fst/translation_table.cuh b/cpp/src/io/fst/translation_table.cuh deleted file mode 100644 index 89da994606c..00000000000 --- a/cpp/src/io/fst/translation_table.cuh +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -#include "rmm/device_uvector.hpp" - -#include - -#include - -namespace cudf { -namespace io { -namespace fst { -namespace detail { - -/** - * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols - * that the finite-state transducer is supposed to output for each transition - * - * @tparam OutSymbolT The symbol type being output - * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition - * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support - * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - */ -template -class TransducerLookupTable { - private: - struct _TempStorage { - OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT out_symbols[MAX_TABLE_SIZE]; - }; - - public: - using TempStorage = cub::Uninitialized<_TempStorage>; - - struct KernelParameter { - OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; - }; - - /** - * @brief Initializes the translation table (both the host and device parts) - */ - static void InitDeviceTranslationTable( - hostdevice_vector& translation_table_init, - std::vector>> const& trans_table, - rmm::cuda_stream_view stream) - { - std::vector out_symbols; - out_symbols.reserve(MAX_TABLE_SIZE); - std::vector out_symbol_offsets; - out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); - out_symbol_offsets.push_back(0); - - // Iterate over the states in the transition table - for (auto const& state_trans : trans_table) { - uint32_t num_added = 0; - // Iterate over the symbols in the transition table - for (auto const& symbol_out : state_trans) { - // Insert the output symbols for this specific (state, symbol) transition 
- out_symbols.insert(std::end(out_symbols), std::begin(symbol_out), std::end(symbol_out)); - out_symbol_offsets.push_back(out_symbols.size()); - num_added++; - } - - // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of - // this state) - if (MAX_NUM_SYMBOLS > num_added) { - int32_t count = MAX_NUM_SYMBOLS - num_added; - auto begin_it = std::prev(std::end(out_symbol_offsets)); - std::copy(begin_it, begin_it + count, std::back_inserter(out_symbol_offsets)); - } - } - - // Check whether runtime-provided table size exceeds the compile-time given max. table size - if (out_symbols.size() > MAX_TABLE_SIZE) { CUDF_FAIL("Unsupported translation table"); } - - // Prepare host-side data to be copied and passed to the device - std::copy(std::cbegin(out_symbol_offsets), - std::cend(out_symbol_offsets), - translation_table_init.host_ptr()->d_out_offsets); - std::copy(std::cbegin(out_symbols), - std::cend(out_symbols), - translation_table_init.host_ptr()->d_out_symbols); - - // Copy data to device - translation_table_init.host_to_device(stream); - } - - private: - _TempStorage& temp_storage; - - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - public: - /** - * @brief Synchronizes the thread block, if called from device, and, hence, requires all threads - * of the thread block to call the constructor - */ - CUDF_HOST_DEVICE TransducerLookupTable(KernelParameter const& kernel_param, - TempStorage& temp_storage) - : temp_storage(temp_storage.Alias()) - { - constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; -#if CUB_PTX_ARCH > 0 - for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { - this->temp_storage.out_offset[i] = kernel_param.d_out_offsets[i]; - } - // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared - // memory - __syncthreads(); - for (int i = threadIdx.x; i < 
this->temp_storage.out_offset[num_offsets - 1]; i += blockDim.x) { - this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; - } - __syncthreads(); -#else - for (int i = 0; i < num_offsets; i++) { - this->temp_storage.out_symbol_offsets[i] = kernel_param.d_out_offsets[i]; - } - for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { - this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; - } -#endif - } - - template - constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, - SymbolIndexT const match_id, - RelativeOffsetT const relative_offset) const - { - auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; - return temp_storage.out_symbols[offset]; - } - - template - constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, - SymbolIndexT const match_id) const - { - return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - - temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; - } -}; - -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 29c93a6f3bb..012c37ab842 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -189,7 +189,7 @@ TEST_F(FstTest, GroundTruth) using SymbolOffsetT = uint32_t; // Helper class to set up transition table, symbol group lookup table, and translation table - using DfaFstT = cudf::io::fst::Dfa; + using DfaFstT = cudf::io::fst::detail::Dfa; // Prepare cuda stream for data transfers & kernels cudaStream_t stream = nullptr; From 39cff8039c160a3de0795a33c4e4fc2215072900 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 27 Apr 2022 04:42:31 -0700 Subject: [PATCH 05/38] Change interface for FST to not need temp storage --- cpp/src/io/fst/lookup_tables.cuh | 55 ++++++++++++++++++++------------ cpp/tests/io/fst/fst_test.cu | 39 ++++++++-------------- 2 files changed, 49 insertions(+), 45 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 58853919b69..f8921d4091b 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include #include @@ -485,26 +485,41 @@ class Dfa { typename TransducedIndexOutItT, typename TransducedCountOutItT, typename OffsetT> - cudaError_t Transduce(void* d_temp_storage, - size_t& temp_storage_bytes, - SymbolT const* d_chars, - OffsetT num_chars, - TransducedOutItT d_out_it, - TransducedIndexOutItT d_out_idx_it, - TransducedCountOutItT d_num_transduced_out_it, - const uint32_t seed_state = 0, - cudaStream_t stream = 0) + void Transduce(SymbolT const* d_chars, + OffsetT num_chars, + TransducedOutItT d_out_it, + TransducedIndexOutItT d_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + const uint32_t seed_state, + rmm::cuda_stream_view stream) { - return DeviceTransduce(d_temp_storage, - temp_storage_bytes, - this->get_device_view(), - d_chars, - num_chars, - d_out_it, - d_out_idx_it, - d_num_transduced_out_it, - seed_state, - stream); + std::size_t temp_storage_bytes = 0; + rmm::device_buffer 
temp_storage{}; + DeviceTransduce(nullptr, + temp_storage_bytes, + this->get_device_view(), + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); + + if (temp_storage.size() < temp_storage_bytes) { + temp_storage.resize(temp_storage_bytes, stream); + } + + DeviceTransduce(temp_storage.data(), + temp_storage_bytes, + this->get_device_view(), + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); } private: diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 012c37ab842..9e8011bb7df 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -217,8 +217,10 @@ TEST_F(FstTest, GroundTruth) input += input; // Prepare input & output buffers + constexpr std::size_t single_item = 1; rmm::device_uvector d_input(input.size(), stream_view); hostdevice_vector output_gpu(input.size(), stream_view); + hostdevice_vector output_gpu_size(single_item, stream_view); hostdevice_vector out_indexes_gpu(input.size(), stream_view); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); @@ -228,32 +230,19 @@ TEST_F(FstTest, GroundTruth) std::size_t temp_storage_bytes = 0; - // Query temporary storage requirements - ASSERT_CUDA_SUCCEEDED(parser.Transduce(nullptr, - temp_storage_bytes, - d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), - out_indexes_gpu.device_ptr(), - cub::DiscardOutputIterator{}, - start_state, - stream)); - // Allocate device-side temporary storage & run algorithm - rmm::device_buffer temp_storage{temp_storage_bytes, stream_view}; - ASSERT_CUDA_SUCCEEDED(parser.Transduce(temp_storage.data(), - temp_storage_bytes, - d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), - out_indexes_gpu.device_ptr(), - cub::DiscardOutputIterator{}, - start_state, - stream)); + parser.Transduce(d_input.data(), + 
static_cast(d_input.size()), + output_gpu.device_ptr(), + out_indexes_gpu.device_ptr(), + output_gpu_size.device_ptr(), + start_state, + stream); // Async copy results from device to host output_gpu.device_to_host(stream_view); out_indexes_gpu.device_to_host(stream_view); + output_gpu_size.device_to_host(stream_view); // Prepare CPU-side results for verification std::string output_cpu{}; @@ -275,13 +264,13 @@ TEST_F(FstTest, GroundTruth) cudaStreamSynchronize(stream); // Verify results - ASSERT_EQ(output_gpu.size(), output_cpu.size()); + ASSERT_EQ(output_gpu_size[0], output_cpu.size()); ASSERT_EQ(out_indexes_gpu.size(), out_index_cpu.size()); - for (std::size_t i = 0; i < output_gpu.size(); i++) { - ASSERT_EQ(output_gpu.host_ptr()[i], output_cpu[i]) << "Mismatch at index #" << i; + for (std::size_t i = 0; i < output_cpu.size(); i++) { + ASSERT_EQ(output_gpu[i], output_cpu[i]) << "Mismatch at index #" << i; } for (std::size_t i = 0; i < out_indexes_gpu.size(); i++) { - ASSERT_EQ(out_indexes_gpu.host_ptr()[i], out_index_cpu[i]) << "Mismatch at index #" << i; + ASSERT_EQ(out_indexes_gpu[i], out_index_cpu[i]) << "Mismatch at index #" << i; } } From e24a13301a34fbb08d8424a2ba4edfbebd402d67 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 4 May 2022 07:29:00 -0700 Subject: [PATCH 06/38] removing unused var post-cleanup --- cpp/tests/io/fst/fst_test.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 9e8011bb7df..3d4f68b03c4 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -228,8 +228,6 @@ TEST_F(FstTest, GroundTruth) // Run algorithm DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream}; - std::size_t temp_storage_bytes = 0; - // Allocate device-side temporary storage & run algorithm parser.Transduce(d_input.data(), static_cast(d_input.size()), From caf61955c32c57cca287fb9d7e74bf5d0efc8506 Mon Sep 17 00:00:00 2001 From: 
Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 May 2022 10:24:51 -0700 Subject: [PATCH 07/38] unified usage of pragma unrolls --- cpp/src/io/fst/in_reg_array.cuh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index ed5948249d4..3180dbfe132 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -59,7 +59,7 @@ class MultiFragmentInRegArray { //------------------------------------------------------------------------------ // HELPER FUNCTIONS //------------------------------------------------------------------------------ - __device__ __host__ __forceinline__ uint32_t bfe(const uint32_t& data, + __host__ __device__ __forceinline__ uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const { @@ -71,7 +71,7 @@ class MultiFragmentInRegArray { #endif } - __device__ __host__ __forceinline__ void bfi(uint32_t& data, + __host__ __device__ __forceinline__ void bfi(uint32_t& data, uint32_t bits, uint32_t bit_start, uint32_t num_bits) const @@ -97,7 +97,6 @@ class MultiFragmentInRegArray { { uint32_t val = 0; - // #pragma unroll for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { val = val | bfe(data[i], index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM) << (i * BITS_PER_FRAG_ITEM); @@ -107,7 +106,6 @@ class MultiFragmentInRegArray { __host__ __device__ __forceinline__ void Set(uint32_t index, uint32_t value) { - // #pragma unroll for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { uint32_t frag_bits = bfe(value, i * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); bfi(data[i], frag_bits, index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); From ea79a81fb9b0473d37f31c42bce25269a3d17d88 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 May 2022 10:32:17 -0700 Subject: [PATCH 08/38] Adding hostdevice macros to in-reg array --- cpp/src/io/fst/in_reg_array.cuh | 14 ++++++++------ 1 
file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index 3180dbfe132..1180dc594da 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -59,7 +61,7 @@ class MultiFragmentInRegArray { //------------------------------------------------------------------------------ // HELPER FUNCTIONS //------------------------------------------------------------------------------ - __host__ __device__ __forceinline__ uint32_t bfe(const uint32_t& data, + CUDF_HOST_DEVICE uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const { @@ -71,7 +73,7 @@ class MultiFragmentInRegArray { #endif } - __host__ __device__ __forceinline__ void bfi(uint32_t& data, + CUDF_HOST_DEVICE void bfi(uint32_t& data, uint32_t bits, uint32_t bit_start, uint32_t num_bits) const @@ -93,7 +95,7 @@ class MultiFragmentInRegArray { // ACCESSORS //------------------------------------------------------------------------------ public: - __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const + CUDF_HOST_DEVICE uint32_t Get(int32_t index) const { uint32_t val = 0; @@ -104,7 +106,7 @@ class MultiFragmentInRegArray { return val; } - __host__ __device__ __forceinline__ void Set(uint32_t index, uint32_t value) + CUDF_HOST_DEVICE void Set(uint32_t index, uint32_t value) { for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { uint32_t frag_bits = bfe(value, i * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); @@ -115,14 +117,14 @@ class MultiFragmentInRegArray { //------------------------------------------------------------------------------ // CONSTRUCTORS //------------------------------------------------------------------------------ - __host__ __device__ __forceinline__ MultiFragmentInRegArray() + CUDF_HOST_DEVICE MultiFragmentInRegArray() { for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { data[i] = 0; } } - __host__ 
__device__ __forceinline__ MultiFragmentInRegArray(uint32_t const (&array)[NUM_ITEMS]) + CUDF_HOST_DEVICE MultiFragmentInRegArray(uint32_t const (&array)[NUM_ITEMS]) { for (uint32_t i = 0; i < NUM_ITEMS; ++i) { Set(i, array[i]); From 17dcbfd07b73a64a0a1cfda71ea9c2770b6a8662 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 May 2022 10:33:00 -0700 Subject: [PATCH 09/38] making const vars const --- cpp/src/io/fst/agent_dfa.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 3bc59160696..aaafd2d7a22 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -104,7 +104,7 @@ class DFASimulationCallbackWrapper { StateVectorT const& new_state, SymbolIndexT const& symbol_id) { - uint32_t count = transducer_table(old_state.Get(0), symbol_id); + uint32_t const count = transducer_table(old_state.Get(0), symbol_id); if (write) { for (uint32_t out_char = 0; out_char < count; out_char++) { out_it[out_count + out_char] = transducer_table(old_state.Get(0), symbol_id, out_char); @@ -117,7 +117,7 @@ class DFASimulationCallbackWrapper { __host__ __device__ __forceinline__ void TearDown() {} public: - TransducerTableT transducer_table; + TransducerTableT const transducer_table; TransducedOutItT out_it; TransducedIndexOutItT out_idx_it; uint32_t out_count; From 6fdd24a5625150469242af16fdcb1d549b3676e0 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 May 2022 12:17:34 -0700 Subject: [PATCH 10/38] refactor lut sanity check --- cpp/src/io/fst/lookup_tables.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index f8921d4091b..3e5504a6208 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -343,7 +343,7 @@ class TransducerLookupTable { } // Check whether 
runtime-provided table size exceeds the compile-time given max. table size - if (out_symbols.size() > MAX_TABLE_SIZE) { CUDF_FAIL("Unsupported translation table"); } + CUDF_EXPECTS(out_symbols.size() <= MAX_TABLE_SIZE, "Unsupported translation table"); // Prepare host-side data to be copied and passed to the device std::copy(std::cbegin(out_symbol_offsets), From eccf9701432f557b52b1f44b985128668bf1462f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 2 Jun 2022 05:19:53 -0700 Subject: [PATCH 11/38] fixes sg-count & uses rmm stream in fst tests --- cpp/src/io/fst/lookup_tables.cuh | 10 +++++----- cpp/tests/io/fst/fst_test.cu | 31 ++++++++++++++++--------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 3e5504a6208..f245aa76368 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -421,8 +421,8 @@ class TransducerLookupTable { * translation table that specifies which state transitions cause which output to be written). 
* * @tparam OutSymbolT The symbol type being output by the finite-state transducer - * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of - * the transition table) + * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate including the + * wildcard symbol group (one dimension of the transition table) * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the * transition table) */ @@ -439,16 +439,16 @@ class Dfa { using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; // Transition table - using TransitionTableT = detail::TransitionTable; + using TransitionTableT = detail::TransitionTable; using TransitionTableInitT = typename TransitionTableT::KernelParameter; // Translation lookup table using OutSymbolOffsetT = uint32_t; using TranslationTableT = detail::TransducerLookupTable; + NUM_SYMBOLS * NUM_STATES>; using TranslationTableInitT = typename TranslationTableT::KernelParameter; auto get_device_view() diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 3d4f68b03c4..8c43aa92455 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -189,12 +190,10 @@ TEST_F(FstTest, GroundTruth) using SymbolOffsetT = uint32_t; // Helper class to set up transition table, symbol group lookup table, and translation table - using DfaFstT = cudf::io::fst::detail::Dfa; + using DfaFstT = cudf::io::fst::detail::Dfa; // Prepare cuda stream for data transfers & kernels - cudaStream_t stream = nullptr; - cudaStreamCreate(&stream); - rmm::cuda_stream_view stream_view(stream); + rmm::cuda_stream stream{}; // Test input std::string input = R"( {)" @@ -216,17 +215,19 @@ TEST_F(FstTest, GroundTruth) for (std::size_t i = 0; i < 10; i++) input += input; + + // Prepare input & output buffers constexpr std::size_t single_item = 1; - rmm::device_uvector 
d_input(input.size(), stream_view); - hostdevice_vector output_gpu(input.size(), stream_view); - hostdevice_vector output_gpu_size(single_item, stream_view); - hostdevice_vector out_indexes_gpu(input.size(), stream_view); + rmm::device_uvector d_input(input.size(), stream.view()); + hostdevice_vector output_gpu(input.size(), stream.view()); + hostdevice_vector output_gpu_size(single_item, stream.view()); + hostdevice_vector out_indexes_gpu(input.size(), stream.view()); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( - d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream.value())); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream}; + DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; // Allocate device-side temporary storage & run algorithm parser.Transduce(d_input.data(), @@ -235,12 +236,12 @@ TEST_F(FstTest, GroundTruth) out_indexes_gpu.device_ptr(), output_gpu_size.device_ptr(), start_state, - stream); + stream.value()); // Async copy results from device to host - output_gpu.device_to_host(stream_view); - out_indexes_gpu.device_to_host(stream_view); - output_gpu_size.device_to_host(stream_view); + output_gpu.device_to_host(stream.view()); + out_indexes_gpu.device_to_host(stream.view()); + output_gpu_size.device_to_host(stream.view()); // Prepare CPU-side results for verification std::string output_cpu{}; @@ -259,7 +260,7 @@ TEST_F(FstTest, GroundTruth) std::back_inserter(out_index_cpu)); // Make sure results have been copied back to host - cudaStreamSynchronize(stream); + cudaStreamSynchronize(stream.value()); // Verify results ASSERT_EQ(output_gpu_size[0], output_cpu.size()); From 9fe8e4b6e2c527e471d9627369e72595ef3e452c Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 14 Jun 2022 03:12:35 -0700 Subject: [PATCH 12/38] minor doxygen fix --- 
cpp/src/io/fst/device_dfa.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh index d3f0e8be213..10c3b9ac814 100644 --- a/cpp/src/io/fst/device_dfa.cuh +++ b/cpp/src/io/fst/device_dfa.cuh @@ -29,8 +29,8 @@ namespace fst { * @brief Uses a deterministic finite automaton to transduce a sequence of symbols from an input * iterator to a sequence of transduced output symbols. * - * @tparam SymbolItT Random-access input iterator type to symbols fed into the FST * @tparam DfaT The DFA specification + * @tparam SymbolItT Random-access input iterator type to symbols fed into the FST * @tparam TransducedOutItT Random-access output iterator to which the transduced output will be * written * @tparam TransducedIndexOutItT Random-access output iterator type to which the indexes of the From 694a365448a2156d0a1c60fafcd52f67c5f0c3f6 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 15 Jun 2022 04:28:51 -0700 Subject: [PATCH 13/38] adopts suggested fst test changes --- cpp/tests/io/fst/fst_test.cu | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 8c43aa92455..b6d5327bb59 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -20,15 +20,16 @@ #include #include +#include +#include #include +#include #include #include #include #include -#include "cub/cub.cuh" - #include #include @@ -212,8 +213,13 @@ TEST_F(FstTest, GroundTruth) R"(} {} [] [ ])"; // Repeat input sample 1024x - for (std::size_t i = 0; i < 10; i++) - input += input; + size_t string_size = 1 << 10; + auto d_input_scalar = cudf::make_string_scalar(input); + auto& d_string_scalar = static_cast(*d_input_scalar); + const cudf::size_type repeat_times = string_size / input.size(); + auto d_input_string = cudf::strings::repeat_string(d_string_scalar, repeat_times); + auto& d_input 
= static_cast&>(*d_input_string); + input = d_input.to_string(stream); @@ -260,7 +266,7 @@ TEST_F(FstTest, GroundTruth) std::back_inserter(out_index_cpu)); // Make sure results have been copied back to host - cudaStreamSynchronize(stream.value()); + stream.synchronize(); // Verify results ASSERT_EQ(output_gpu_size[0], output_cpu.size()); From f656f494e39f628dee12706ee2a0e9c6ea180126 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 7 Jul 2022 02:41:16 -0700 Subject: [PATCH 14/38] adopts device-side test data gen --- cpp/tests/io/fst/fst_test.cu | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index b6d5327bb59..0c337be61f3 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -26,7 +26,6 @@ #include #include -#include #include #include @@ -91,28 +90,18 @@ static std::pair fst_baseline(InputItT begin, auto const& symbol = *it; std::size_t symbol_group = 0; - bool found = false; // Iterate over symbol groups and search for the first symbol group containing the current // symbol for (auto const& sg : symbol_group_lut) { - for (auto const& s : sg) - if (s == symbol) found = true; - if (found) break; + if (std::find(std::cbegin(sg), std::cend(sg), symbol) != std::cend(sg)) { break; } symbol_group++; } // Output the translated symbols to the output tape - size_t inserted = 0; for (auto out : translation_table[state][symbol_group]) { - // std::cout << in_offset << ": " << out << "\n"; *out_tape = out; ++out_tape; - inserted++; - } - - // Output the index of the current symbol, iff it caused some output to be written - if (inserted > 0) { *out_index_tape = in_offset; out_index_tape++; } @@ -120,6 +109,7 @@ static std::pair fst_baseline(InputItT begin, // Transition the state of the finite-state machine state = transition_table[state][symbol_group]; + // Continue with next symbol from 
input tape in_offset++; } return {out_tape, out_index_tape}; @@ -195,10 +185,11 @@ TEST_F(FstTest, GroundTruth) // Prepare cuda stream for data transfers & kernels rmm::cuda_stream stream{}; + rmm::cuda_stream_view stream_view(stream); // Test input std::string input = R"( {)" - R"(category": "reference",)" + R"("category": "reference",)" R"("index:" [4,12,42],)" R"("author": "Nigel Rees",)" R"("title": "Sayings of the Century",)" @@ -212,8 +203,7 @@ TEST_F(FstTest, GroundTruth) R"("price": 8.95)" R"(} {} [] [ ])"; - // Repeat input sample 1024x - size_t string_size = 1 << 10; + size_t string_size = input.size() * (1 << 10); auto d_input_scalar = cudf::make_string_scalar(input); auto& d_string_scalar = static_cast(*d_input_scalar); const cudf::size_type repeat_times = string_size / input.size(); @@ -221,16 +211,11 @@ TEST_F(FstTest, GroundTruth) auto& d_input = static_cast&>(*d_input_string); input = d_input.to_string(stream); - - // Prepare input & output buffers constexpr std::size_t single_item = 1; - rmm::device_uvector d_input(input.size(), stream.view()); - hostdevice_vector output_gpu(input.size(), stream.view()); - hostdevice_vector output_gpu_size(single_item, stream.view()); - hostdevice_vector out_indexes_gpu(input.size(), stream.view()); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( - d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream.value())); + hostdevice_vector output_gpu(input.size(), stream_view); + hostdevice_vector output_gpu_size(single_item, stream_view); + hostdevice_vector out_indexes_gpu(input.size(), stream_view); // Run algorithm DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; @@ -270,11 +255,10 @@ TEST_F(FstTest, GroundTruth) // Verify results ASSERT_EQ(output_gpu_size[0], output_cpu.size()); - ASSERT_EQ(out_indexes_gpu.size(), out_index_cpu.size()); for (std::size_t i = 0; i < output_cpu.size(); i++) { ASSERT_EQ(output_gpu[i], output_cpu[i]) << "Mismatch at index #" << i; } - for 
(std::size_t i = 0; i < out_indexes_gpu.size(); i++) { + for (std::size_t i = 0; i < output_cpu.size(); i++) { ASSERT_EQ(out_indexes_gpu[i], out_index_cpu[i]) << "Mismatch at index #" << i; } } From 485a1c632bc6fe03030c445b8cff06dc1c3ca32f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 8 Jul 2022 22:49:57 -0700 Subject: [PATCH 15/38] adopts c++17 namespaces declarations --- cpp/src/io/fst/agent_dfa.cuh | 10 ++-------- cpp/src/io/fst/device_dfa.cuh | 8 ++------ cpp/src/io/fst/dispatch_dfa.cuh | 10 ++-------- cpp/src/io/fst/in_reg_array.cuh | 20 ++++++-------------- cpp/src/io/fst/lookup_tables.cuh | 10 ++-------- 5 files changed, 14 insertions(+), 44 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index aaafd2d7a22..f641cd3e053 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -19,10 +19,7 @@ #include -namespace cudf { -namespace io { -namespace fst { -namespace detail { +namespace cudf::io::fst::detail { //----------------------------------------------------------------------------- // STATE VECTOR @@ -715,7 +712,4 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ } } -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf +} // namespace cudf::io::fst::detail diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh index 10c3b9ac814..56afc32e643 100644 --- a/cpp/src/io/fst/device_dfa.cuh +++ b/cpp/src/io/fst/device_dfa.cuh @@ -21,9 +21,7 @@ #include -namespace cudf { -namespace io { -namespace fst { +namespace cudf::io::fst { /** * @brief Uses a deterministic finite automaton to transduce a sequence of symbols from an input @@ -93,6 +91,4 @@ cudaError_t DeviceTransduce(void* d_temp_storage, stream); } -} // namespace fst -} // namespace io -} // namespace cudf +} // namespace cudf::io::fst diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index 
316d6ea0d5f..6de1757bfed 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -22,10 +22,7 @@ #include -namespace cudf { -namespace io { -namespace fst { -namespace detail { +namespace cudf::io::fst::detail { /** * @brief The tuning policy comprising all the architecture-specific compile-time tuning parameters. @@ -456,7 +453,4 @@ struct DispatchFSM : DeviceFSMPolicy { sm_count, stv_tile_state, fst_offset_tile_state, seed_state, d_thread_state_transition); } }; -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf +} // namespace cudf::io::fst::detail diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index 1180dc594da..9d59d04cb9b 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -21,10 +21,7 @@ #include -namespace cudf { -namespace io { -namespace fst { -namespace detail { +namespace cudf::io::fst::detail { /** * @brief A bit-packed array of items that can be backed by registers yet allows to be dynamically @@ -61,9 +58,7 @@ class MultiFragmentInRegArray { //------------------------------------------------------------------------------ // HELPER FUNCTIONS //------------------------------------------------------------------------------ - CUDF_HOST_DEVICE uint32_t bfe(const uint32_t& data, - uint32_t bit_start, - uint32_t num_bits) const + CUDF_HOST_DEVICE uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 return cub::BFE(data, bit_start, num_bits); @@ -74,9 +69,9 @@ class MultiFragmentInRegArray { } CUDF_HOST_DEVICE void bfi(uint32_t& data, - uint32_t bits, - uint32_t bit_start, - uint32_t num_bits) const + uint32_t bits, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 cub::BFI(data, data, bits, bit_start, num_bits); @@ -132,7 +127,4 @@ class MultiFragmentInRegArray { } }; -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf +} // 
namespace cudf::io::fst::detail diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index f245aa76368..f1dd31d1f4b 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -26,10 +26,7 @@ #include #include -namespace cudf { -namespace io { -namespace fst { -namespace detail { +namespace cudf::io::fst::detail { /** * @brief Class template that can be plugged into the finite-state machine to look up the symbol @@ -528,7 +525,4 @@ class Dfa { hostdevice_vector translation_table_init{}; }; -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf +} // namespace cudf::io::fst::detail From 5f1c4b544882f1d35ac8701eb611e8f64c12ac56 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 11 Jul 2022 06:26:47 -0700 Subject: [PATCH 16/38] removes state vector-wrapper in favor of vanilla array --- cpp/src/io/fst/agent_dfa.cuh | 152 ++++++++++++----------------------- 1 file changed, 51 insertions(+), 101 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index f641cd3e053..65918a33e5e 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -21,35 +21,6 @@ namespace cudf::io::fst::detail { -//----------------------------------------------------------------------------- -// STATE VECTOR -//----------------------------------------------------------------------------- -/** - * @brief A vector is able to hold multiple state indices (e.g., to represent multiple DFA - * instances, where the i-th item would represent the i-th DFA instance). 
- * - * @tparam StateIndexT Signed or unsigned type used to index items inside the vector - * @tparam NUM_ITEMS The number of items to be allocated for a vector - */ -template -class MultiItemStateVector { - public: - template - __host__ __device__ __forceinline__ void Set(IndexT index, StateIndexT value) noexcept - { - state_[index] = value; - } - - template - __host__ __device__ __forceinline__ StateIndexT Get(IndexT index) const noexcept - { - return state_[index]; - } - - private: - StateIndexT state_[NUM_ITEMS]; -}; - //----------------------------------------------------------------------------- // DFA-SIMULATION STATE COMPOSITION FUNCTORS //----------------------------------------------------------------------------- @@ -57,10 +28,15 @@ class MultiItemStateVector { * @brief Implements an associative composition operation for state transition vectors and * offset-to-overap vectors to be used with a prefix scan. * + * Read the following table as follows: c = op(l,r), where op is the composition operator. + * For row 0: l maps 0 to 2. r maps 2 to 2. Hence, the result for 0 is 2. + * For row 1: l maps 1 to 1. r maps 1 to 2. Hence, the result for 1 is 2. + * For row 2: l maps 2 to 0. r maps 0 to 1. Hence, the result for 2 is 1. + * * l r = c ( s->l->r) * 0: [2] [1] [2] (i.e. 0->2->2) * 1: [1] [2] [2] (i.e. 1->1->2) - * 2: [0] [2] [1] (i.e. 2->0->2) + * 2: [0] [2] [1] (i.e. 
2->0->1) * @tparam NUM_ITEMS The number of items stored within a vector */ template @@ -68,7 +44,7 @@ struct VectorCompositeOp { template __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) { - VectorT res; + VectorT res{}; for (int32_t i = 0; i < NUM_ITEMS; ++i) { res.Set(i, rhs.Get(lhs.Get(i))); } @@ -95,16 +71,16 @@ class DFASimulationCallbackWrapper { if (!write) out_count = 0; } - template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - StateVectorT const& old_state, - StateVectorT const& new_state, - SymbolIndexT const& symbol_id) + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id) { - uint32_t const count = transducer_table(old_state.Get(0), symbol_id); + uint32_t const count = transducer_table(old_state, symbol_id); if (write) { for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = transducer_table(old_state.Get(0), symbol_id, out_char); + out_it[out_count + out_char] = transducer_table(old_state, symbol_id, out_char); out_idx_it[out_count + out_char] = offset + character_index; } } @@ -125,22 +101,11 @@ class DFASimulationCallbackWrapper { //----------------------------------------------------------------------------- // STATE-TRANSITION CALLBACKS //----------------------------------------------------------------------------- -class StateTransitionCallbackOp { +template +class StateVectorTransitionOp { public: - template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id) const - { - } -}; -/// Type alias for a state transition callback class that performs no operation on any callback -using NoOpStateTransitionOp = StateTransitionCallbackOp; - -template -class StateVectorTransitionOp : public StateTransitionCallbackOp { - 
public: - __host__ __device__ __forceinline__ - StateVectorTransitionOp(TransitionTableT const& transition_table, StateVectorT& state_vector) + __host__ __device__ __forceinline__ StateVectorTransitionOp( + TransitionTableT const& transition_table, std::array& state_vector) : transition_table(transition_table), state_vector(state_vector) { } @@ -150,39 +115,37 @@ class StateVectorTransitionOp : public StateTransitionCallbackOp { SymbolIndexT const read_symbol_id) const { for (int32_t i = 0; i < NUM_INSTANCES; ++i) { - state_vector.Set(i, transition_table(state_vector.Get(i), read_symbol_id)); + state_vector[i] = transition_table(state_vector[i], read_symbol_id); } } public: - StateVectorT& state_vector; + std::array& state_vector; const TransitionTableT& transition_table; }; -template +template struct StateTransitionOp { - StateVectorT old_state_vector; - StateVectorT state_vector; + int32_t state; const TransitionTableT& transition_table; CallbackOpT& callback_op; - __host__ __device__ __forceinline__ StateTransitionOp(const TransitionTableT& transition_table, - StateVectorT state_vector, + __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, + int32_t state, CallbackOpT& callback_op) - : transition_table(transition_table), - state_vector(state_vector), - old_state_vector(state_vector), - callback_op(callback_op) + : transition_table(transition_table), state(state), callback_op(callback_op) { } template - __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, - const SymbolIndexT& read_symbol_id) + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id) { - old_state_vector = state_vector; - state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); - callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); + // Remember what state we were in before we made the transition + 
int32_t previous_state = state; + + state = transition_table(state, read_symbol_id); + callback_op.ReadSymbol(character_index, previous_state, state, read_symbol_id); } }; @@ -237,9 +200,6 @@ struct AgentDFA { { } - //--------------------------------------------------------------------- - // STATIC PARSING PRIMITIVES - //--------------------------------------------------------------------- template /*IS_FULL_BLOCK*/) { - uint32_t matched_id; - // Iterate over symbols #pragma unroll for (int32_t i = 0; i < NUM_SYMBOLS; ++i) { if (IS_FULL_BLOCK || threadIdx.x * SYMBOLS_PER_THREAD + i < max_num_chars) { - matched_id = symbol_matcher(chars[i]); + uint32_t matched_id = symbol_matcher(chars[i]); callback_op.ReadSymbol(i, matched_id); } } @@ -400,20 +358,16 @@ struct AgentDFA { } } - template + template __device__ __forceinline__ void GetThreadStateTransitionVector( const SymbolMatcherT& symbol_matcher, const TransitionTableT& transition_table, const CharT* d_chars, const OffsetT block_offset, const OffsetT num_total_symbols, - StateVectorT& state_vector) + std::array& state_vector) { - using StateVectorTransitionOpT = - StateVectorTransitionOp; + using StateVectorTransitionOpT = StateVectorTransitionOp; // Start parsing and to transition states StateVectorTransitionOpT transition_op(transition_table, state_vector); @@ -439,14 +393,11 @@ struct AgentDFA { GetThreadStateTransitions( symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); } - - // transition_op.TearDown(); } template __device__ __forceinline__ void GetThreadStateTransitions( SymbolMatcherT const& symbol_matcher, @@ -454,14 +405,14 @@ struct AgentDFA { CharT const* d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, - StateVectorT& state_vector, + int32_t& state, CallbackOpT& callback_op, cub::Int2Type /**/) { - using StateTransitionOpT = StateTransitionOp; + using StateTransitionOpT = StateTransitionOp; // Start parsing and to transition states - StateTransitionOpT 
transition_op(transition_table, state_vector, callback_op); + StateTransitionOpT transition_op(transition_table, state, callback_op); // Load characters into shared memory if (!BYPASS_LOAD) LoadBlock(d_chars, block_offset, num_total_symbols); @@ -528,7 +479,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ SYMBOLS_PER_BLOCK = AgentDfaSimT::SYMBOLS_PER_BLOCK }; - // Shared memory required by the DFA simulator + // Shared memory required by the DFA simulation algorithm __shared__ typename AgentDfaSimT::TempStorage dfa_storage; // Shared memory required by the symbol group lookup table @@ -552,18 +503,18 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ // Set up DFA AgentDfaSimT agent_dfa(dfa_storage); - // Memory is the state transition vector passed on to the second stage of the algorithm + // The state transition vector passed on to the second stage of the algorithm StateVectorT out_state_vector; // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { - // StateVectorT state_vector; - MultiItemStateVector state_vector; + // Keeping track of the state for each of the state machines + std::array state_vector; // Initialize the seed state transition vector with the identity vector #pragma unroll for (int32_t i = 0; i < NUM_STATES; ++i) { - state_vector.Set(i, i); + state_vector[i] = i; } // Compute the state transition vector @@ -577,7 +528,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ // Initialize the state transition vector passed on to the second stage #pragma unroll for (int32_t i = 0; i < NUM_STATES; ++i) { - out_state_vector.Set(i, state_vector.Get(i)); + out_state_vector.Set(i, state_vector[i]); } // Write out state-transition vector @@ -585,10 +536,10 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x] = out_state_vector; } } + // Stage 2: Perform FSM simulation 
if ((!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS) { - constexpr uint32_t SINGLE_ITEM_COUNT = 1; - MultiItemStateVector state; + int32_t state = 0; //------------------------------------------------------------------------------ // SINGLE-PASS: @@ -637,10 +588,9 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ .ExclusiveScan(out_state_vector, out_state_vector, state_vector_scan_op, prefix_op); } __syncthreads(); - state.Set(0, out_state_vector.Get(seed_state)); + state = out_state_vector.Get(seed_state); } else { - state.Set( - 0, d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x].Get(seed_state)); + state = d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x].Get(seed_state); } // Perform finite-state machine simulation, computing size of transduced output @@ -649,8 +599,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ TransducedIndexOutItT> callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); - MultiItemStateVector t_start_state; - t_start_state.Set(0, state.Get(seed_state)); + int32_t t_start_state = state; agent_dfa.GetThreadStateTransitions(symbol_matcher, transition_table, d_chars, @@ -661,6 +610,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ cub::Int2Type()); __syncthreads(); + using OffsetPrefixScanCallbackOpT_ = cub::TilePrefixCallbackOp; From e6f8defa0b79d040eb465cb76a12af194d1ff899 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 11 Jul 2022 09:06:01 -0700 Subject: [PATCH 17/38] some west-const remainders & unifies StateIndexT --- cpp/src/io/fst/agent_dfa.cuh | 107 +++++++++++++++++------------------ 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 65918a33e5e..36de79a0757 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -21,6 +21,9 @@ namespace cudf::io::fst::detail { 
+/// Type used to enumerate (and index) into the states defined by a DFA +using StateIndexT = uint32_t; + //----------------------------------------------------------------------------- // DFA-SIMULATION STATE COMPOSITION FUNCTORS //----------------------------------------------------------------------------- @@ -105,7 +108,7 @@ template class StateVectorTransitionOp { public: __host__ __device__ __forceinline__ StateVectorTransitionOp( - TransitionTableT const& transition_table, std::array& state_vector) + TransitionTableT const& transition_table, std::array& state_vector) : transition_table(transition_table), state_vector(state_vector) { } @@ -120,18 +123,18 @@ class StateVectorTransitionOp { } public: - std::array& state_vector; - const TransitionTableT& transition_table; + std::array& state_vector; + TransitionTableT const& transition_table; }; template struct StateTransitionOp { - int32_t state; - const TransitionTableT& transition_table; + StateIndexT state; + TransitionTableT const& transition_table; CallbackOpT& callback_op; __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, - int32_t state, + StateIndexT state, CallbackOpT& callback_op) : transition_table(transition_table), state(state), callback_op(callback_op) { @@ -142,7 +145,7 @@ struct StateTransitionOp { SymbolIndexT const& read_symbol_id) { // Remember what state we were in before we made the transition - int32_t previous_state = state; + StateIndexT previous_state = state; state = transition_table(state, read_symbol_id); callback_op.ReadSymbol(character_index, previous_state, state, read_symbol_id); @@ -152,7 +155,6 @@ struct StateTransitionOp { template struct AgentDFA { using SymbolIndexT = uint32_t; - using StateIndexT = uint32_t; using AliasedLoadT = uint32_t; using CharT = typename std::iterator_traits::value_type; @@ -200,14 +202,13 @@ struct AgentDFA { { } - template - __device__ __forceinline__ static void ThreadParse(const SymbolMatcherT& 
symbol_matcher, - const CharT* chars, - const SymbolIndexT& max_num_chars, + __device__ __forceinline__ static void ThreadParse(SymbolMatcherT const& symbol_matcher, + CharT const* chars, + SymbolIndexT const& max_num_chars, CallbackOpT callback_op, cub::Int2Type /*IS_FULL_BLOCK*/) { @@ -215,7 +216,7 @@ struct AgentDFA { #pragma unroll for (int32_t i = 0; i < NUM_SYMBOLS; ++i) { if (IS_FULL_BLOCK || threadIdx.x * SYMBOLS_PER_THREAD + i < max_num_chars) { - uint32_t matched_id = symbol_matcher(chars[i]); + auto matched_id = symbol_matcher(chars[i]); callback_op.ReadSymbol(i, matched_id); } } @@ -226,9 +227,9 @@ struct AgentDFA { typename StateTransitionOpT, int32_t IS_FULL_BLOCK> __device__ __forceinline__ void GetThreadStateTransitions( - const SymbolMatcherT& symbol_matcher, - const CharT* chars, - const SymbolIndexT& max_num_chars, + SymbolMatcherT const& symbol_matcher, + CharT const* chars, + SymbolIndexT const& max_num_chars, StateTransitionOpT& state_transition_op, cub::Int2Type /*IS_FULL_BLOCK*/) { @@ -239,15 +240,15 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING FULL BLOCK OF CHARACTERS, NON-ALIASED //--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(const CharT* d_chars, - const OffsetT block_offset, - const OffsetT num_total_symbols, + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, cub::Int2Type /*IS_FULL_BLOCK*/, cub::Int2Type<1> /*ALIGNMENT*/) { CharT thread_chars[SYMBOLS_PER_THREAD]; - const CharT* d_block_symbols = d_chars + block_offset; + CharT const* d_block_symbols = d_chars + block_offset; cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_chars); #pragma unroll @@ -259,9 +260,9 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING PARTIAL BLOCK OF CHARACTERS, NON-ALIASED 
//--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(const CharT* d_chars, - const OffsetT block_offset, - const OffsetT num_total_symbols, + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, cub::Int2Type /*IS_FULL_BLOCK*/, cub::Int2Type<1> /*ALIGNMENT*/) { @@ -272,7 +273,7 @@ struct AgentDFA { // Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) OffsetT num_total_chars = num_total_symbols - block_offset; - const CharT* d_block_symbols = d_chars + block_offset; + CharT const* d_block_symbols = d_chars + block_offset; cub::LoadDirectStriped( threadIdx.x, d_block_symbols, thread_chars, num_total_chars); @@ -285,16 +286,16 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING FULL BLOCK OF CHARACTERS, ALIASED //--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(const CharT* d_chars, - const OffsetT block_offset, - const OffsetT num_total_symbols, + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, cub::Int2Type /*IS_FULL_BLOCK*/, cub::Int2Type /*ALIGNMENT*/) { AliasedLoadT thread_units[UINTS_PER_THREAD]; - const AliasedLoadT* d_block_symbols = - reinterpret_cast(d_chars + block_offset); + AliasedLoadT const* d_block_symbols = + reinterpret_cast(d_chars + block_offset); cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_units); #pragma unroll @@ -306,9 +307,9 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING PARTIAL BLOCK OF CHARACTERS, ALIASED //--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(const CharT* d_chars, - const OffsetT block_offset, - const OffsetT num_total_symbols, + __device__ 
__forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, cub::Int2Type /*IS_FULL_BLOCK*/, cub::Int2Type /*ALIGNMENT*/) { @@ -320,8 +321,8 @@ struct AgentDFA { OffsetT num_total_units = CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT)); - const AliasedLoadT* d_block_symbols = - reinterpret_cast(d_chars + block_offset); + AliasedLoadT const* d_block_symbols = + reinterpret_cast(d_chars + block_offset); cub::LoadDirectStriped( threadIdx.x, d_block_symbols, thread_units, num_total_units); @@ -334,9 +335,9 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING BLOCK OF CHARACTERS: DISPATCHER //--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(const CharT* d_chars, - const OffsetT block_offset, - const OffsetT num_total_symbols) + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols) { // Check if pointer is aligned to four bytes if (((uintptr_t)(const void*)(d_chars + block_offset) % 4) == 0) { @@ -360,12 +361,12 @@ struct AgentDFA { template __device__ __forceinline__ void GetThreadStateTransitionVector( - const SymbolMatcherT& symbol_matcher, - const TransitionTableT& transition_table, - const CharT* d_chars, - const OffsetT block_offset, - const OffsetT num_total_symbols, - std::array& state_vector) + SymbolMatcherT const& symbol_matcher, + TransitionTableT const& transition_table, + CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + std::array& state_vector) { using StateVectorTransitionOpT = StateVectorTransitionOp; @@ -405,7 +406,7 @@ struct AgentDFA { CharT const* d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, - int32_t& state, + StateIndexT& state, CallbackOpT& callback_op, cub::Int2Type /**/) { @@ -459,7 +460,7 @@ 
__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ void SimulateDFAKernel(DfaT dfa, SymbolItT d_chars, OffsetT const num_chars, - uint32_t seed_state, + StateIndexT seed_state, StateVectorT* __restrict__ d_thread_state_transition, TileStateT tile_state, OutOffsetScanTileState offset_tile_state, @@ -467,11 +468,9 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ TransducedIndexOutItT transduced_out_idx_it, TransducedCountOutItT d_num_transduced_out_it) { - using StateIndexT = uint32_t; - using AgentDfaSimT = AgentDFA; - static constexpr uint32_t NUM_STATES = DfaT::MAX_NUM_STATES; + static constexpr int32_t NUM_STATES = DfaT::MAX_NUM_STATES; enum { BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS, @@ -509,7 +508,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { // Keeping track of the state for each of the state machines - std::array state_vector; + std::array state_vector; // Initialize the seed state transition vector with the identity vector #pragma unroll @@ -539,7 +538,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ // Stage 2: Perform FSM simulation if ((!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS) { - int32_t state = 0; + StateIndexT state = 0; //------------------------------------------------------------------------------ // SINGLE-PASS: @@ -599,7 +598,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ TransducedIndexOutItT> callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); - int32_t t_start_state = state; + StateIndexT t_start_state = state; agent_dfa.GetThreadStateTransitions(symbol_matcher, transition_table, d_chars, From a798852ea24e44a8432847fae82f4009cce20c05 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 11 Jul 2022 11:00:22 -0700 Subject: [PATCH 18/38] adds check for state 
transition narrowing conversion --- cpp/src/io/fst/agent_dfa.cuh | 29 ++++++++++++++++++++--------- cpp/src/io/fst/lookup_tables.cuh | 6 +++++- cpp/tests/io/fst/fst_test.cu | 31 +++++++++++++++---------------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 36de79a0757..7eff0c50024 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -24,9 +24,6 @@ namespace cudf::io::fst::detail { /// Type used to enumerate (and index) into the states defined by a DFA using StateIndexT = uint32_t; -//----------------------------------------------------------------------------- -// DFA-SIMULATION STATE COMPOSITION FUNCTORS -//----------------------------------------------------------------------------- /** * @brief Implements an associative composition operation for state transition vectors and * offset-to-overap vectors to be used with a prefix scan. @@ -55,9 +52,18 @@ struct VectorCompositeOp { } }; -//----------------------------------------------------------------------------- -// DFA-SIMULATION CALLBACK WRAPPERS/HELPERS -//----------------------------------------------------------------------------- +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols + * it intends to write out and writing out such symbols to the given output iterators. + * + * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up + * the symbols that are supposed to be emitted on a given state transition. + * @tparam TransducedOutItT A Random-access output iterator type to which symbols returned by the + * transducer table are assignable. 
+ * @tparam TransducedIndexOutItT A Random-access output iterator type to which indexes are written. + */ template class DFASimulationCallbackWrapper { public: @@ -101,9 +107,14 @@ class DFASimulationCallbackWrapper { bool write; }; -//----------------------------------------------------------------------------- -// STATE-TRANSITION CALLBACKS -//----------------------------------------------------------------------------- +/** + * @brief Helper class that transitions the state of multiple DFA instances simultaneously whenever + * a symbol is read. + * + * @tparam NUM_INSTANCES The number of DFA instances to keep track of + * @tparam TransitionTableT The transition table type used for looking up the new state for a + * current_state and a read_symbol. + */ template class StateVectorTransitionOp { public: diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index f1dd31d1f4b..a0a9f81a302 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -168,13 +168,17 @@ class TransitionTable { ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; }; + template ()})>> static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, - const std::vector>& trans_table, + std::vector> const& trans_table, rmm::cuda_stream_view stream) { // trans_table[state][symbol] -> new state for (std::size_t state = 0; state < trans_table.size(); ++state) { for (std::size_t symbol = 0; symbol < trans_table[state].size(); ++symbol) { + CUDF_EXPECTS( + trans_table[state][symbol] <= std::numeric_limits::max(), + "Target state index value exceeds value representable by the transition table's type"); transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = trans_table[state][symbol]; } diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 0c337be61f3..e1ee655f30b 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -118,21 +118,20 @@ static std::pair 
fst_baseline(InputItT begin, //------------------------------------------------------------------------------ // TEST FST SPECIFICATIONS //------------------------------------------------------------------------------ -// FST to check for brackets and braces outside of pairs of quotes -// The state being active while being outside of a string. When encountering an opening bracket -// or curly brace, we push it onto the stack. When encountering a closing bracket or brace, we -// pop it from the stack. -constexpr uint32_t TT_OOS = 0U; - -// The state being active while being within a string (e.g., field name or a string value). We do -// not push or pop from the stack while being in this state. -constexpr uint32_t TT_STR = 1U; - -// The state being active after encountering an escape symbol (e.g., '\') while being in the TT_STR -// state. constexpr uint32_t TT_ESC = 2U; // cmt to avoid 'unused' warning - -// Total number of states -constexpr uint32_t TT_NUM_STATES = 3U; +enum DFA_STATES : char { + // The state being active while being outside of a string. When encountering an opening bracket or + // curly brace, we push it onto the stack. When encountering a closing bracket or brace, we pop it + // from the stack. + TT_OOS = 0U, + // The state being active while being within a string (e.g., field name or a string value). We do + // not push or pop from the stack while being in this state. + TT_STR, + // The state being active after encountering an escape symbol (e.g., '\') while being in the + // TT_STR state. 
+ TT_ESC [[maybe_unused]], + // Total number of states + TT_NUM_STATES +}; // Definition of the symbol groups enum PDA_SG_ID { @@ -147,7 +146,7 @@ enum PDA_SG_ID { }; // Transition table -const std::vector> pda_state_tt = { +const std::vector> pda_state_tt = { /* IN_STATE { [ } ] " \ OTHER */ /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR, TT_STR}, From eb2496205ce65c808e968348c22e35862bb19ff7 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 12 Jul 2022 04:52:36 -0700 Subject: [PATCH 19/38] fixes logical stack test includes --- cpp/tests/io/fst/logical_stack_test.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 3c2cdd7fb5c..d76ff16f85d 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -22,7 +22,6 @@ #include #include -#include #include #include From f52e61457b2b88b8b6a4f61bfd214283ea2d28a9 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 14 Jul 2022 09:16:18 -0700 Subject: [PATCH 20/38] replaces enum with typed constexpr --- cpp/src/io/fst/agent_dfa.cuh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 7eff0c50024..788e455592b 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -483,11 +483,8 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ static constexpr int32_t NUM_STATES = DfaT::MAX_NUM_STATES; - enum { - BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentDFAPolicy::ITEMS_PER_THREAD, - SYMBOLS_PER_BLOCK = AgentDfaSimT::SYMBOLS_PER_BLOCK - }; + constexpr uint32_t BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS; + constexpr uint32_t SYMBOLS_PER_BLOCK = AgentDfaSimT::SYMBOLS_PER_BLOCK; // 
Shared memory required by the DFA simulation algorithm __shared__ typename AgentDfaSimT::TempStorage dfa_storage; From 3038058e48347ef95a18fe2f18190c5e0de7c9a0 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 14 Jul 2022 09:17:09 -0700 Subject: [PATCH 21/38] adds excplitis error checking --- cpp/src/io/fst/dispatch_dfa.cuh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index 6de1757bfed..57e62608841 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -168,7 +168,7 @@ struct DispatchFSM : DeviceFSMPolicy { // Get PTX version int ptx_version; error = cub::PtxVersion(ptx_version); - if (error) return error; + if (error != cudaSuccess) return error; // Create dispatch functor DispatchFSM dispatch(d_temp_storage, @@ -310,15 +310,15 @@ struct DispatchFSM : DeviceFSMPolicy { cudaError_t error = cudaSuccess; // Get SM count - int device_ordinal; - int sm_count; + int device_ordinal = -1; + int sm_count = -1; // Get current device error = cudaGetDevice(&device_ordinal); - if (error) + if (error != cudaSuccess)return error; - error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal); - if (error) return error; + error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal); + if (error != cudaSuccess) return error; //------------------------------------------------------------------------------ // DERIVED TYPEDEFS @@ -377,21 +377,21 @@ struct DispatchFSM : DeviceFSMPolicy { // Bytes needed for tile status descriptors (fusing state-transition vector + DFA simulation) if (SINGLE_PASS_STV) { error = ScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_SINGLE_PASS_STV]); - if (error) return error; + if (error != cudaSuccess) return error; } // Bytes needed for tile status descriptors (DFA simulation pass for output 
size computation + // output-generating pass) if (IS_FST) { error = FstScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_FST_OFFSET]); - if (error) return error; + if (error != cudaSuccess) return error; } // Alias the temporary allocations from the single storage blob (or compute the necessary size // of the blob) error = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); - if (error) return error; + if (error != cudaSuccess) return error; // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) return cudaSuccess; @@ -408,7 +408,7 @@ struct DispatchFSM : DeviceFSMPolicy { // Construct the tile status (aliases memory internally et al.) error = fst_offset_tile_state.Init( num_blocks, allocations[MEM_FST_OFFSET], allocation_sizes[MEM_FST_OFFSET]); - if (error) return error; + if (error != cudaSuccess) return error; constexpr uint32_t FST_INIT_TPB = 256; uint32_t num_fst_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, FST_INIT_TPB); initialization_pass_kernel<<>>( @@ -423,7 +423,7 @@ struct DispatchFSM : DeviceFSMPolicy { // Construct the tile status (aliases memory internally et al.) 
error = stv_tile_state.Init( num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]); - if (error) return error; + if (error != cudaSuccess) return error; constexpr uint32_t STV_INIT_TPB = 256; uint32_t num_stv_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, STV_INIT_TPB); initialization_pass_kernel<<>>(stv_tile_state, From d351e5c4197acf7c7ab215ea7555926cb2d1f5b8 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 14 Jul 2022 09:17:59 -0700 Subject: [PATCH 22/38] addresses style review comments & fixes a todo --- cpp/src/io/fst/lookup_tables.cuh | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index a0a9f81a302..208890d28d3 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -41,7 +41,7 @@ class SingleSymbolSmemLUT { // Type used for representing a symbol group id (i.e., what we return for a given symbol) using SymbolGroupIdT = uint8_t; - /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) + // Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); struct _TempStorage { @@ -60,9 +60,6 @@ class SingleSymbolSmemLUT { using TempStorage = cub::Uninitialized<_TempStorage>; - //------------------------------------------------------------------------------ - // HELPER METHODS - //------------------------------------------------------------------------------ /** * @brief * @@ -104,21 +101,14 @@ class SingleSymbolSmemLUT { sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; // Alias memory / return memory requiremenets - // TODO I think this could be +1? 
- sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 2; + sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 1; sgid_init.host_to_device(stream); } - //------------------------------------------------------------------------------ - // MEMBER VARIABLES - //------------------------------------------------------------------------------ _TempStorage& temp_storage; SymbolGroupIdT num_valid_entries; - //------------------------------------------------------------------------------ - // CONSTRUCTOR - //------------------------------------------------------------------------------ __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; @@ -170,17 +160,17 @@ class TransitionTable { template ()})>> static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, - std::vector> const& trans_table, + std::vector> const& translation_table, rmm::cuda_stream_view stream) { - // trans_table[state][symbol] -> new state - for (std::size_t state = 0; state < trans_table.size(); ++state) { - for (std::size_t symbol = 0; symbol < trans_table[state].size(); ++symbol) { + // translation_table[state][symbol] -> new state + for (std::size_t state = 0; state < translation_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { CUDF_EXPECTS( - trans_table[state][symbol] <= std::numeric_limits::max(), + translation_table[state][symbol] <= std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = - trans_table[state][symbol]; + translation_table[state][symbol]; } } @@ -314,7 +304,7 @@ class TransducerLookupTable { */ static void InitDeviceTranslationTable( hostdevice_vector& translation_table_init, - std::vector>> const& trans_table, + std::vector>> const& translation_table, rmm::cuda_stream_view stream) { std::vector 
out_symbols; @@ -324,7 +314,7 @@ class TransducerLookupTable { out_symbol_offsets.push_back(0); // Iterate over the states in the transition table - for (auto const& state_trans : trans_table) { + for (auto const& state_trans : translation_table) { uint32_t num_added = 0; // Iterate over the symbols in the transition table for (auto const& symbol_out : state_trans) { From 3f479528b2b24b6eddae4d622509f8392b97eb0d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 14 Jul 2022 09:22:03 -0700 Subject: [PATCH 23/38] replaces gtest asserts with expects --- cpp/tests/io/fst/fst_test.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index e1ee655f30b..5d169cd9ac1 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -255,10 +255,10 @@ TEST_F(FstTest, GroundTruth) // Verify results ASSERT_EQ(output_gpu_size[0], output_cpu.size()); for (std::size_t i = 0; i < output_cpu.size(); i++) { - ASSERT_EQ(output_gpu[i], output_cpu[i]) << "Mismatch at index #" << i; + EXPECT_EQ(output_gpu[i], output_cpu[i]) << "Mismatch at index #" << i; } for (std::size_t i = 0; i < output_cpu.size(); i++) { - ASSERT_EQ(out_indexes_gpu[i], out_index_cpu[i]) << "Mismatch at index #" << i; + EXPECT_EQ(out_indexes_gpu[i], out_index_cpu[i]) << "Mismatch at index #" << i; } } From cba16196b356ecc807a6ae67a20b357677cf26a4 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 14 Jul 2022 09:31:12 -0700 Subject: [PATCH 24/38] fixes style in dispatch dfa --- cpp/src/io/fst/dispatch_dfa.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index 57e62608841..2a5594c383d 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -311,11 +311,11 @@ struct DispatchFSM : DeviceFSMPolicy { // Get SM count 
int device_ordinal = -1; - int sm_count = -1; + int sm_count = -1; // Get current device error = cudaGetDevice(&device_ordinal); - if (error != cudaSuccess)return error; + if (error != cudaSuccess) return error; error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal); if (error != cudaSuccess) return error; From bea2a02226314cddb6073726d2feafa21d89bb52 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 15 Jul 2022 01:54:20 -0700 Subject: [PATCH 25/38] replaces vanilla loop with iota --- cpp/src/io/fst/agent_dfa.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 788e455592b..77f04555dc8 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -19,6 +19,9 @@ #include +#include +#include + namespace cudf::io::fst::detail { /// Type used to enumerate (and index) into the states defined by a DFA @@ -519,10 +522,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ std::array state_vector; // Initialize the seed state transition vector with the identity vector -#pragma unroll - for (int32_t i = 0; i < NUM_STATES; ++i) { - state_vector[i] = i; - } + thrust::sequence(thrust::seq, std::begin(state_vector), std::end(state_vector)); // Compute the state transition vector agent_dfa.GetThreadStateTransitionVector(symbol_matcher, From 8a184e97ce14a400f70d463f7e55f95f32d7a547 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 15 Jul 2022 22:51:18 -0700 Subject: [PATCH 26/38] rephrases documentation on in-reg array --- cpp/src/io/fst/in_reg_array.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index 9d59d04cb9b..352d7871699 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -51,7 +51,7 @@ class 
MultiFragmentInRegArray { static constexpr uint32_t BITS_PER_FRAG_ITEM = 0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1); - // Number of fragments required to store and to reconstruct each item + // The total number of fragments required to store all the items static constexpr uint32_t FRAGMENTS_PER_ITEM = (MIN_BITS_PER_ITEM + BITS_PER_FRAG_ITEM - 1) / BITS_PER_FRAG_ITEM; From 4783aae2aabbd03f9a439822ddd02d0328b5d52a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Jul 2022 06:28:05 -0700 Subject: [PATCH 27/38] improves style in fst test --- cpp/tests/io/fst/fst_test.cu | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 5d169cd9ac1..606fe6e7bcd 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -89,22 +89,18 @@ static std::pair fst_baseline(InputItT begin, // The symbol currently being read auto const& symbol = *it; - std::size_t symbol_group = 0; - // Iterate over symbol groups and search for the first symbol group containing the current - // symbol - for (auto const& sg : symbol_group_lut) { - if (std::find(std::cbegin(sg), std::cend(sg), symbol) != std::cend(sg)) { break; } - symbol_group++; - } + // symbol, if no match is found we use cend(symbol_group_lut) as the "catch-all" symbol group + auto symbol_group_it = std::find_if(std::cbegin(symbol_group_lut), std::cend(symbol_group_lut), + [symbol](auto& sg) { + return std::find(std::cbegin(sg), std::cend(sg), symbol) != std::cend(sg); + }); + auto symbol_group = std::distance(std::cbegin(symbol_group_lut), symbol_group_it); // Output the translated symbols to the output tape - for (auto out : translation_table[state][symbol_group]) { - *out_tape = out; - ++out_tape; - *out_index_tape = in_offset; - out_index_tape++; - } + out_tape = std::copy(std::cbegin(translation_table[state][symbol_group]), 
std::cend(translation_table[state][symbol_group]), out_tape); + auto out_size = std::distance(std::cbegin(translation_table[state][symbol_group]), std::cend(translation_table[state][symbol_group])); + out_index_tape = std::fill_n(out_index_tape, out_size, in_offset); // Transition the state of the finite-state machine state = transition_table[state][symbol_group]; @@ -128,7 +124,7 @@ enum DFA_STATES : char { TT_STR, // The state being active after encountering an escape symbol (e.g., '\') while being in the // TT_STR state. - TT_ESC [[maybe_unused]], + TT_ESC, // Total number of states TT_NUM_STATES }; @@ -149,7 +145,7 @@ enum PDA_SG_ID { const std::vector> pda_state_tt = { /* IN_STATE { [ } ] " \ OTHER */ /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, - /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR, TT_STR}, + /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR}, /* TT_ESC */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}; // Translation table (i.e., for each transition, what are the symbols that we output) From 6203709bcfad35d745eb5cae77ddcf6166fa8216 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Jul 2022 10:30:56 -0700 Subject: [PATCH 28/38] adds comments in in_reg array --- cpp/src/io/fst/in_reg_array.cuh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index 352d7871699..f0866ebe3bb 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -52,12 +53,14 @@ class MultiFragmentInRegArray { 0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1); // The total number of fragments required to store all the items - static constexpr uint32_t FRAGMENTS_PER_ITEM = - (MIN_BITS_PER_ITEM + BITS_PER_FRAG_ITEM - 1) / BITS_PER_FRAG_ITEM; + static constexpr uint32_t 
FRAGMENTS_PER_ITEM = cudf::util::div_rounding_up_safe(MIN_BITS_PER_ITEM, BITS_PER_FRAG_ITEM); //------------------------------------------------------------------------------ // HELPER FUNCTIONS //------------------------------------------------------------------------------ + /** + * @brief Returns the \p num_bits bits starting at \p bit_start + */ CUDF_HOST_DEVICE uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 @@ -68,6 +71,9 @@ class MultiFragmentInRegArray { #endif } +/** + * @brief Replaces the \p num_bits bits in \p data starting from \p bit_start with the lower \p num_bits from \p bits. + */ CUDF_HOST_DEVICE void bfi(uint32_t& data, uint32_t bits, uint32_t bit_start, From ad5817a4a9b3f12127926f95edd958d9fdb1a4d7 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Jul 2022 10:34:59 -0700 Subject: [PATCH 29/38] adds comments to lookup tables --- cpp/src/io/fst/lookup_tables.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 208890d28d3..279baef939d 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -31,7 +31,7 @@ namespace cudf::io::fst::detail { /** * @brief Class template that can be plugged into the finite-state machine to look up the symbol * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no - * look-ahead). + * look-ahead). The class uses shared memory for the lookups. * * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id */ @@ -61,7 +61,7 @@ class SingleSymbolSmemLUT { using TempStorage = cub::Uninitialized<_TempStorage>; /** - * @brief + * @brief Initializes the given \p sgid_init with the symbol group lookups defined by \p symbol_strings. 
* * @param[out] sgid_init A hostdevice_vector that will be populated * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols @@ -128,9 +128,7 @@ class SingleSymbolSmemLUT { #else // CPU-side init - for (std::size_t i = 0; i < kernel_param.num_luts; i++) { - this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; - } + std::copy_n(kernel_param.sym_to_sgid, kernel_param.num_luts, this->temp_storage.sym_to_sgid); #endif } @@ -271,7 +269,7 @@ class dfa_device_view { /** * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols - * that the finite-state transducer is supposed to output for each transition + * that the finite-state transducer is supposed to output for each transition. The class uses shared memory for the lookups. * * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols From dc5565359d4aaa92a68d81ff3a74674be2240e15 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Jul 2022 12:03:59 -0700 Subject: [PATCH 30/38] fixes formatting --- cpp/src/io/fst/in_reg_array.cuh | 10 ++++++---- cpp/src/io/fst/lookup_tables.cuh | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index f0866ebe3bb..87ea2d3325f 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -53,7 +53,8 @@ class MultiFragmentInRegArray { 0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1); // The total number of fragments required to store all the items - static constexpr uint32_t FRAGMENTS_PER_ITEM = cudf::util::div_rounding_up_safe(MIN_BITS_PER_ITEM, BITS_PER_FRAG_ITEM); + static constexpr uint32_t FRAGMENTS_PER_ITEM = + cudf::util::div_rounding_up_safe(MIN_BITS_PER_ITEM, BITS_PER_FRAG_ITEM); 
//------------------------------------------------------------------------------ // HELPER FUNCTIONS @@ -71,9 +72,10 @@ class MultiFragmentInRegArray { #endif } -/** - * @brief Replaces the \p num_bits bits in \p data starting from \p bit_start with the lower \p num_bits from \p bits. - */ + /** + * @brief Replaces the \p num_bits bits in \p data starting from \p bit_start with the lower \p + * num_bits from \p bits. + */ CUDF_HOST_DEVICE void bfi(uint32_t& data, uint32_t bits, uint32_t bit_start, diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 279baef939d..98ef49d893d 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -61,7 +61,8 @@ class SingleSymbolSmemLUT { using TempStorage = cub::Uninitialized<_TempStorage>; /** - * @brief Initializes the given \p sgid_init with the symbol group lookups defined by \p symbol_strings. + * @brief Initializes the given \p sgid_init with the symbol group lookups defined by \p + * symbol_strings. * * @param[out] sgid_init A hostdevice_vector that will be populated * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols @@ -269,7 +270,8 @@ class dfa_device_view { /** * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols - * that the finite-state transducer is supposed to output for each transition. The class uses shared memory for the lookups. + * that the finite-state transducer is supposed to output for each transition. The class uses shared + * memory for the lookups. 
* * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols From 378be9f43e00e80e77b8f685bf4a65ca9a854dcc Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Jul 2022 13:02:53 -0700 Subject: [PATCH 31/38] exchanges loops in favor of copy and fills --- cpp/src/io/fst/lookup_tables.cuh | 41 ++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 98ef49d893d..5884279e4db 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -116,6 +116,12 @@ class SingleSymbolSmemLUT { return private_storage; } + /** + * @brief Initializes the lookup table, primarily to be invoked from within device code but also + * provides host-side implementation for verification. + * @note Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ constexpr CUDF_HOST_DEVICE SingleSymbolSmemLUT(KernelParameter const& kernel_param, TempStorage& temp_storage) : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) @@ -140,6 +146,13 @@ class SingleSymbolSmemLUT { } }; +/** + * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a new target state. The + * class uses shared memory for the lookups. 
+ * + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + */ template class TransitionTable { private: @@ -187,9 +200,8 @@ class TransitionTable { } __syncthreads(); #else - for (int i = 0; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i++) { - this->temp_storage.transitions[i] = kernel_param.transitions[i]; - } + std::copy_n( + kernel_param.transitions, MAX_NUM_STATES * MAX_NUM_SYMBOLS, this->temp_storage.transitions); #endif } @@ -299,8 +311,11 @@ class TransducerLookupTable { OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; }; - /** - * @brief Initializes the translation table (both the host and device parts) + /** + * @brief Initializes the lookup table, primarily to be invoked from within device code but also + * provides host-side implementation for verification. + * @note Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor */ static void InitDeviceTranslationTable( hostdevice_vector& translation_table_init, @@ -329,7 +344,7 @@ class TransducerLookupTable { if (MAX_NUM_SYMBOLS > num_added) { int32_t count = MAX_NUM_SYMBOLS - num_added; auto begin_it = std::prev(std::end(out_symbol_offsets)); - std::copy(begin_it, begin_it + count, std::back_inserter(out_symbol_offsets)); + std::fill_n(begin_it, count, out_symbol_offsets[0]); } } @@ -359,7 +374,9 @@ class TransducerLookupTable { public: /** - * @brief Synchronizes the thread block, if called from device, and, hence, requires all threads + * @brief Initializes the lookup table, primarily to be invoked from within device code but also + * provides host-side implementation for verification. 
+ * @note Synchronizes the thread block, if called from device, and, hence, requires all threads * of the thread block to call the constructor */ CUDF_HOST_DEVICE TransducerLookupTable(KernelParameter const& kernel_param, @@ -379,12 +396,10 @@ class TransducerLookupTable { } __syncthreads(); #else - for (int i = 0; i < num_offsets; i++) { - this->temp_storage.out_symbol_offsets[i] = kernel_param.d_out_offsets[i]; - } - for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { - this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; - } + std::copy_n(kernel_param.d_out_offsets, num_offsets, this->temp_storage.out_symbol_offsets); + std::copy_n(kernel_param.d_out_symbols, + this->temp_storage.out_symbol_offsets, + this->temp_storage.out_symbols); #endif } From 4ba547227a5e90e01e995df9c76ae2a284548c1b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:00:35 -0700 Subject: [PATCH 32/38] clarifies documentation in agent dfa --- cpp/src/io/fst/agent_dfa.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 77f04555dc8..d847598d6dd 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -28,8 +28,8 @@ namespace cudf::io::fst::detail { using StateIndexT = uint32_t; /** - * @brief Implements an associative composition operation for state transition vectors and - * offset-to-overap vectors to be used with a prefix scan. + * @brief Implements an associative composition operation for state transition vectors to be used + * with a prefix scan. * * Read the following table as follows: c = op(l,r), where op is the composition operator. * For row 0: l maps 0 to 2. r maps 2 to 2. Hence, the result for 0 is 2. 
From 7980978a97bb3650a3c16f62c87584a352c6b991 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:26:50 -0700 Subject: [PATCH 33/38] disambiguates transition and translation tables --- cpp/src/io/fst/lookup_tables.cuh | 12 +++++++++++- cpp/tests/io/fst/fst_test.cu | 15 +++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 5884279e4db..341c3b7a51d 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -311,7 +311,7 @@ class TransducerLookupTable { OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; }; - /** + /** * @brief Initializes the lookup table, primarily to be invoked from within device code but also * provides host-side implementation for verification. * @note Synchronizes the thread block, if called from device, and, hence, requires all threads @@ -464,6 +464,16 @@ class Dfa { } public: + /** + * @brief Constructs a new DFA. + * + * @param symbol_vec Sequence container of symbol groups. Each symbol group is a sequence + * container to symbols within that group. The index of the symbol group containing a symbol being + * read will be used as symbol_gid of the transition and translation tables. 
+ * @param tt_vec The transition table + * @param out_tt_vec The translation table + * @param stream The stream to which memory operations and kernels are getting dispatched to + */ template Dfa(SymbolGroupIdItT const& symbol_vec, std::vector> const& tt_vec, diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 606fe6e7bcd..757410b7e65 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -91,15 +91,18 @@ static std::pair fst_baseline(InputItT begin, // Iterate over symbol groups and search for the first symbol group containing the current // symbol, if no match is found we use cend(symbol_group_lut) as the "catch-all" symbol group - auto symbol_group_it = std::find_if(std::cbegin(symbol_group_lut), std::cend(symbol_group_lut), - [symbol](auto& sg) { - return std::find(std::cbegin(sg), std::cend(sg), symbol) != std::cend(sg); - }); + auto symbol_group_it = + std::find_if(std::cbegin(symbol_group_lut), std::cend(symbol_group_lut), [symbol](auto& sg) { + return std::find(std::cbegin(sg), std::cend(sg), symbol) != std::cend(sg); + }); auto symbol_group = std::distance(std::cbegin(symbol_group_lut), symbol_group_it); // Output the translated symbols to the output tape - out_tape = std::copy(std::cbegin(translation_table[state][symbol_group]), std::cend(translation_table[state][symbol_group]), out_tape); - auto out_size = std::distance(std::cbegin(translation_table[state][symbol_group]), std::cend(translation_table[state][symbol_group])); + out_tape = std::copy(std::cbegin(translation_table[state][symbol_group]), + std::cend(translation_table[state][symbol_group]), + out_tape); + auto out_size = std::distance(std::cbegin(translation_table[state][symbol_group]), + std::cend(translation_table[state][symbol_group])); out_index_tape = std::fill_n(out_index_tape, out_size, in_offset); // Transition the state of the finite-state machine From 2bce0616851b47fb7e7cafb39adf15eb28454018 Mon Sep 17 00:00:00 2001 From: Elias Stehle 
<3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Jul 2022 21:58:40 -0700 Subject: [PATCH 34/38] minor style fix --- cpp/tests/io/fst/fst_test.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 757410b7e65..40866d8bc8e 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -98,11 +98,13 @@ static std::pair fst_baseline(InputItT begin, auto symbol_group = std::distance(std::cbegin(symbol_group_lut), symbol_group_it); // Output the translated symbols to the output tape - out_tape = std::copy(std::cbegin(translation_table[state][symbol_group]), - std::cend(translation_table[state][symbol_group]), - out_tape); - auto out_size = std::distance(std::cbegin(translation_table[state][symbol_group]), - std::cend(translation_table[state][symbol_group])); + out_tape = std::copy(std::cbegin(translation_table[state][symbol_group]), + std::cend(translation_table[state][symbol_group]), + out_tape); + + auto out_size = std::distance(std::cbegin(translation_table[state][symbol_group]), + std::cend(translation_table[state][symbol_group])); + out_index_tape = std::fill_n(out_index_tape, out_size, in_offset); // Transition the state of the finite-state machine From b37f71634104360e9be0e6c5b0ec69f3482ec975 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 21 Jul 2022 06:20:48 -0700 Subject: [PATCH 35/38] if constexprs and doxy on DFA helper --- cpp/src/io/fst/dispatch_dfa.cuh | 6 +++--- cpp/src/io/fst/in_reg_array.cuh | 4 ++-- cpp/src/io/fst/lookup_tables.cuh | 24 ++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index 2a5594c383d..cabbe863131 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -375,14 +375,14 @@ struct DispatchFSM : DeviceFSMPolicy { 
allocation_sizes[MEM_SCAN] = vector_scan_storage_bytes; // Bytes needed for tile status descriptors (fusing state-transition vector + DFA simulation) - if (SINGLE_PASS_STV) { + if constexpr (SINGLE_PASS_STV) { error = ScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_SINGLE_PASS_STV]); if (error != cudaSuccess) return error; } // Bytes needed for tile status descriptors (DFA simulation pass for output size computation + // output-generating pass) - if (IS_FST) { + if constexpr (IS_FST) { error = FstScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_FST_OFFSET]); if (error != cudaSuccess) return error; } @@ -404,7 +404,7 @@ struct DispatchFSM : DeviceFSMPolicy { // INITIALIZE SCAN TILE STATES COMPUTING TRANSDUCED OUTPUT OFFSETS //------------------------------------------------------------------------------ FstScanTileStateT fst_offset_tile_state; - if (IS_FST) { + if constexpr (IS_FST) { // Construct the tile status (aliases memory internally et al.) error = fst_offset_tile_state.Init( num_blocks, allocations[MEM_FST_OFFSET], allocation_sizes[MEM_FST_OFFSET]); diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index 87ea2d3325f..e26d494a557 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -62,7 +62,7 @@ class MultiFragmentInRegArray { /** * @brief Returns the \p num_bits bits starting at \p bit_start */ - CUDF_HOST_DEVICE uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const + CUDF_HOST_DEVICE [[nodiscard]] uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 return cub::BFE(data, bit_start, num_bits); @@ -98,7 +98,7 @@ class MultiFragmentInRegArray { // ACCESSORS //------------------------------------------------------------------------------ public: - CUDF_HOST_DEVICE uint32_t Get(int32_t index) const + CUDF_HOST_DEVICE [[nodiscard]] uint32_t Get(int32_t index) const { uint32_t val = 0; diff --git 
a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 341c3b7a51d..b364b2cc3d7 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -496,6 +496,30 @@ class Dfa { TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); } + /** + * @brief Dispatches the finite-state transducer algorithm to the GPU. + * + * @tparam SymbolT The atomic symbol type from the input tape + * @tparam TransducedOutItT Random-access output iterator to which the transduced output will be + * written + * @tparam TransducedIndexOutItT Random-access output iterator type to which the indexes of the + * symbols that caused some output to be written. + * @tparam TransducedCountOutItT A single-item output iterator type to which the total number of + * output symbols is written + * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) + * the output symbols + * @param d_chars Pointer to the input string of symbols + * @param num_chars The total number of input symbols to process + * @param d_out_it Random-access output iterator to which the transduced output is + * written + * @param d_out_idx_it Random-access output iterator to which, the index i is written + * iff the i-th input symbol caused some output to be written + * @param d_num_transduced_out_it A single-item output iterator type to which the total number + * of output symbols is written + * @param seed_state The DFA's starting state. For streaming DFAs this corresponds to the + * "end-state" of the previous invocation of the algorithm. + * @param stream CUDA stream to launch kernels within. Default is the null-stream. 
+ */ template Date: Thu, 21 Jul 2022 07:56:31 -0700 Subject: [PATCH 36/38] minor documentation fix --- cpp/src/io/fst/device_dfa.cuh | 4 ++-- cpp/src/io/fst/lookup_tables.cuh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh index 56afc32e643..7eeff27eef1 100644 --- a/cpp/src/io/fst/device_dfa.cuh +++ b/cpp/src/io/fst/device_dfa.cuh @@ -31,8 +31,8 @@ namespace cudf::io::fst { * @tparam SymbolItT Random-access input iterator type to symbols fed into the FST * @tparam TransducedOutItT Random-access output iterator to which the transduced output will be * written - * @tparam TransducedIndexOutItT Random-access output iterator type to which the indexes of the - * symbols that caused some output to be written. + * @tparam TransducedIndexOutItT Random-access output iterator type to which the input symbols' + * indexes are written. * @tparam TransducedCountOutItT A single-item output iterator type to which the total number of * output symbols is written * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index b364b2cc3d7..c5033868925 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -502,8 +502,8 @@ class Dfa { * @tparam SymbolT The atomic symbol type from the input tape * @tparam TransducedOutItT Random-access output iterator to which the transduced output will be * written - * @tparam TransducedIndexOutItT Random-access output iterator type to which the indexes of the - * symbols that caused some output to be written. + * @tparam TransducedIndexOutItT Random-access output iterator type to which the input symbols' + * indexes are written. 
 * @tparam TransducedCountOutItT A single-item output iterator type to which the total number of * output symbols is written * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) From 6c889f70e71e2cd750ca780d12812feff3015983 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 21 Jul 2022 09:52:37 -0700 Subject: [PATCH 37/38] replaces loop for comparing vectors with generic macro --- cpp/include/cudf_test/cudf_gtest.hpp | 9 +++++++++ cpp/src/io/fst/in_reg_array.cuh | 4 +++- cpp/tests/io/fst/fst_test.cu | 8 ++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index 6c62b0159ca..fb2680545d3 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -176,3 +176,12 @@ struct TypeList> { } catch (std::exception & e) { \ FAIL() << "statement:" << #statement << std::endl << "reason: " << e.what() << std::endl; \ } + +/** + * @brief test macro comparing for equality of \p lhs and \p rhs for the first \p size elements.
+ */ +#define CUDF_TEST_EXPECT_VECTOR_EQUAL(lhs, rhs, size) \ + do { \ + for (decltype(size) i = 0; i < size; i++) \ + EXPECT_EQ(lhs[i], rhs[i]) << "Mismatch at index #" << i; \ + } while (0) diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index e26d494a557..0819deb6d97 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -62,7 +62,9 @@ class MultiFragmentInRegArray { /** * @brief Returns the \p num_bits bits starting at \p bit_start */ - CUDF_HOST_DEVICE [[nodiscard]] uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const + CUDF_HOST_DEVICE [[nodiscard]] uint32_t bfe(const uint32_t& data, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 return cub::BFE(data, bit_start, num_bits); diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 40866d8bc8e..e198c804222 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -255,12 +255,8 @@ TEST_F(FstTest, GroundTruth) // Verify results ASSERT_EQ(output_gpu_size[0], output_cpu.size()); - for (std::size_t i = 0; i < output_cpu.size(); i++) { - EXPECT_EQ(output_gpu[i], output_cpu[i]) << "Mismatch at index #" << i; - } - for (std::size_t i = 0; i < output_cpu.size(); i++) { - EXPECT_EQ(out_indexes_gpu[i], out_index_cpu[i]) << "Mismatch at index #" << i; - } + CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output_cpu, output_cpu.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(out_indexes_gpu, out_index_cpu, output_cpu.size()); } CUDF_TEST_PROGRAM_MAIN() From 8a54c728aeccb01d1e6b6ee188901b3973986d55 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 21 Jul 2022 09:52:57 -0700 Subject: [PATCH 38/38] uses new vector comparison for logical stack test --- cpp/tests/io/fst/logical_stack_test.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu 
index d76ff16f85d..dda737f005d 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -244,9 +244,7 @@ TEST_F(LogicalStackTest, GroundTruth) // Verify results ASSERT_EQ(string_size, top_of_stack_cpu.size()); ASSERT_EQ(top_of_stack_gpu.size(), top_of_stack_cpu.size()); - for (size_t i = 0; i < string_size && i < top_of_stack_cpu.size(); i++) { - ASSERT_EQ(top_of_stack_gpu.host_ptr()[i], top_of_stack_cpu[i]) << "Mismatch at index #" << i; - } + CUDF_TEST_EXPECT_VECTOR_EQUAL(top_of_stack_gpu.host_ptr(), top_of_stack_cpu, string_size); } CUDF_TEST_PROGRAM_MAIN()