From 355d1e43e29e4eeadc21f9d4d9e6aa43ee8afe9b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Apr 2022 05:11:32 -0700 Subject: [PATCH] clean up & addressing review comments --- cpp/src/io/fst/agent_dfa.cuh | 28 ++++++++++++---------- cpp/src/io/fst/dispatch_dfa.cuh | 4 ++-- cpp/src/io/fst/in_reg_array.cuh | 42 ++++++++++++++++----------------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index d983f9287a9..0611973f78c 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -17,8 +17,6 @@ #include "in_reg_array.cuh" -#include - #include namespace cudf { @@ -40,13 +38,13 @@ template class MultiItemStateVector { public: template - constexpr CUDF_HOST_DEVICE void Set(IndexT index, StateIndexT value) noexcept + __host__ __device__ __forceinline__ void Set(IndexT index, StateIndexT value) noexcept { state_[index] = value; } template - constexpr CUDF_HOST_DEVICE StateIndexT Get(IndexT index) const noexcept + __host__ __device__ __forceinline__ StateIndexT Get(IndexT index) const noexcept { return state_[index]; } @@ -71,7 +69,7 @@ class MultiItemStateVector { template struct VectorCompositeOp { template - constexpr CUDF_HOST_DEVICE VectorT operator()(VectorT const& lhs, VectorT const& rhs) + __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) { VectorT res; for (int32_t i = 0; i < NUM_ITEMS; ++i) { @@ -109,8 +107,7 @@ class DFASimulationCallbackWrapper { uint32_t count = transducer_table(old_state.Get(0), symbol_id); if (write) { for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = - transducer_table(old_state.Get(0), symbol_id, out_char); + out_it[out_count + out_char] = transducer_table(old_state.Get(0), symbol_id, out_char); out_idx_it[out_count + out_char] = offset + character_index; } } @@ -188,8 +185,8 @@ struct StateTransitionOp { __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, const SymbolIndexT& read_symbol_id) { - using TransitionVectorT= typename TransitionTableT::TransitionVectorT ; - old_state_vector = state_vector; + using TransitionVectorT = typename TransitionTableT::TransitionVectorT; + old_state_vector = state_vector; state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); } @@ -344,7 +341,8 @@ struct AgentDFA { { AliasedLoadT thread_units[UINTS_PER_THREAD]; - const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + const AliasedLoadT* d_block_symbols = + reinterpret_cast(d_chars + block_offset); cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_units); #pragma unroll @@ -370,7 +368,8 @@ struct AgentDFA { OffsetT num_total_units = CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT)); - const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + const AliasedLoadT* d_block_symbols = + reinterpret_cast(d_chars + block_offset); cub::LoadDirectStriped( threadIdx.x, d_block_symbols, thread_units, num_total_units); @@ -419,7 +418,8 @@ struct AgentDFA { const OffsetT num_total_symbols, StateVectorT& state_vector) { - using StateVectorTransitionOpT = StateVectorTransitionOp; + using StateVectorTransitionOpT = + StateVectorTransitionOp; // Start parsing and to transition states StateVectorTransitionOpT transition_op(transition_table, state_vector); @@ -650,7 +650,9 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ } // Perform finite-state machine simulation, computing size of transduced output - DFASimulationCallbackWrapper + DFASimulationCallbackWrapper callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); MultiItemStateVector t_start_state; diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index fc14faaf10a..316d6ea0d5f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -119,7 +119,7 @@ struct DispatchFSM : DeviceFSMPolicy { TransducedIndexOutItT transduced_out_idx_it; TransducedCountOutItT d_num_transduced_out_it; cudaStream_t stream; - int ptx_version; + int const ptx_version; //------------------------------------------------------------------------------ // CONSTRUCTOR @@ -422,7 +422,7 @@ struct DispatchFSM : DeviceFSMPolicy { // COMPUTE STATE-TRANSITION VECTORS //------------------------------------------------------------------------------ ScanTileStateT stv_tile_state; - if constexpr(SINGLE_PASS_STV) { + if constexpr (SINGLE_PASS_STV) { // Construct the tile status (aliases memory internally et al.) error = stv_tile_state.Init( num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]); diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index f9619c82fe8..ed5948249d4 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -35,56 +35,55 @@ namespace detail { * @tparam BackingFragmentT The data type that is holding the fragments */ template -struct MultiFragmentInRegArray { - /// [b] Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] +class MultiFragmentInRegArray { + private: + /// Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] static constexpr uint32_t MIN_BITS_PER_ITEM = (MAX_ITEM_VALUE == 0) ? 1 : cub::Log2<(MAX_ITEM_VALUE + 1)>::VALUE; /// Number of bits that each fragment can store static constexpr uint32_t NUM_BITS_PER_FRAGMENT = sizeof(BackingFragmentT) * 8; - /// [a] The number of bits per fragment per item in the array + /// The number of bits per fragment per item in the array static constexpr uint32_t AVAIL_BITS_PER_FRAG_ITEM = NUM_BITS_PER_FRAGMENT / NUM_ITEMS; - /// [k] The number of bits per item per fragment to be a power of two to avoid costly integer + /// The number of bits per item per fragment to be a power of two to avoid costly integer /// multiplication - /// TODO: specialise for VOLTA and later architectures that have efficient integer multiplication static constexpr uint32_t BITS_PER_FRAG_ITEM = 0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1); - static constexpr uint32_t LOG2_BITS_PER_FRAG_ITEM = cub::Log2::VALUE; - // [f] Number of fragments required to store and to reconstruct an item + // Number of fragments required to store and to reconstruct each item static constexpr uint32_t FRAGMENTS_PER_ITEM = (MIN_BITS_PER_ITEM + BITS_PER_FRAG_ITEM - 1) / BITS_PER_FRAG_ITEM; //------------------------------------------------------------------------------ - // MEMBER VARIABLES + // HELPER FUNCTIONS //------------------------------------------------------------------------------ - __device__ __host__ __forceinline__ unsigned int bfe(const unsigned int& data, - unsigned int bit_start, - unsigned int num_bits) const + __device__ __host__ __forceinline__ uint32_t bfe(const uint32_t& data, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 return cub::BFE(data, bit_start, num_bits); #else - const unsigned int MASK = (1 << num_bits) - 1; + const uint32_t MASK = (1 << num_bits) - 1; return (data >> bit_start) & MASK; #endif } - __device__ __host__ __forceinline__ void bfi(unsigned int& data, - unsigned int bits, - unsigned int bit_start, - unsigned int num_bits) const + __device__ __host__ __forceinline__ void bfi(uint32_t& data, + uint32_t bits, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 cub::BFI(data, data, bits, bit_start, num_bits); #else - unsigned int x = bits << bit_start; - unsigned int y = data; - unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; - unsigned int MASK_Y = ~MASK_X; - data = (y & MASK_Y) | (x & MASK_X); + uint32_t x = bits << bit_start; + uint32_t y = data; + uint32_t MASK_X = ((1 << num_bits) - 1) << bit_start; + uint32_t MASK_Y = ~MASK_X; + data = (y & MASK_Y) | (x & MASK_X); #endif } @@ -93,6 +92,7 @@ struct MultiFragmentInRegArray { //------------------------------------------------------------------------------ // ACCESSORS //------------------------------------------------------------------------------ + public: __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const { uint32_t val = 0;