Skip to content

Commit

Permalink
clean up & addressing review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
elstehle committed Jul 13, 2022
1 parent 0557d41 commit 355d1e4
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 36 deletions.
28 changes: 15 additions & 13 deletions cpp/src/io/fst/agent_dfa.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

#include "in_reg_array.cuh"

#include <cudf/types.hpp>

#include <cub/cub.cuh>

namespace cudf {
Expand All @@ -40,13 +38,13 @@ template <typename StateIndexT, int32_t NUM_ITEMS>
class MultiItemStateVector {
public:
template <typename IndexT>
constexpr CUDF_HOST_DEVICE void Set(IndexT index, StateIndexT value) noexcept
__host__ __device__ __forceinline__ void Set(IndexT index, StateIndexT value) noexcept
{
state_[index] = value;
}

template <typename IndexT>
constexpr CUDF_HOST_DEVICE StateIndexT Get(IndexT index) const noexcept
__host__ __device__ __forceinline__ StateIndexT Get(IndexT index) const noexcept
{
return state_[index];
}
Expand All @@ -71,7 +69,7 @@ class MultiItemStateVector {
template <int32_t NUM_ITEMS>
struct VectorCompositeOp {
template <typename VectorT>
constexpr CUDF_HOST_DEVICE VectorT operator()(VectorT const& lhs, VectorT const& rhs)
__host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs)
{
VectorT res;
for (int32_t i = 0; i < NUM_ITEMS; ++i) {
Expand Down Expand Up @@ -109,8 +107,7 @@ class DFASimulationCallbackWrapper {
uint32_t count = transducer_table(old_state.Get(0), symbol_id);
if (write) {
for (uint32_t out_char = 0; out_char < count; out_char++) {
out_it[out_count + out_char] =
transducer_table(old_state.Get(0), symbol_id, out_char);
out_it[out_count + out_char] = transducer_table(old_state.Get(0), symbol_id, out_char);
out_idx_it[out_count + out_char] = offset + character_index;
}
}
Expand Down Expand Up @@ -188,8 +185,8 @@ struct StateTransitionOp {
__host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index,
const SymbolIndexT& read_symbol_id)
{
using TransitionVectorT= typename TransitionTableT::TransitionVectorT ;
old_state_vector = state_vector;
using TransitionVectorT = typename TransitionTableT::TransitionVectorT;
old_state_vector = state_vector;
state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id));
callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id);
}
Expand Down Expand Up @@ -344,7 +341,8 @@ struct AgentDFA {
{
AliasedLoadT thread_units[UINTS_PER_THREAD];

const AliasedLoadT* d_block_symbols = reinterpret_cast<const AliasedLoadT*>(d_chars + block_offset);
const AliasedLoadT* d_block_symbols =
reinterpret_cast<const AliasedLoadT*>(d_chars + block_offset);
cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_block_symbols, thread_units);

#pragma unroll
Expand All @@ -370,7 +368,8 @@ struct AgentDFA {
OffsetT num_total_units =
CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT));

const AliasedLoadT* d_block_symbols = reinterpret_cast<const AliasedLoadT*>(d_chars + block_offset);
const AliasedLoadT* d_block_symbols =
reinterpret_cast<const AliasedLoadT*>(d_chars + block_offset);
cub::LoadDirectStriped<BLOCK_THREADS>(
threadIdx.x, d_block_symbols, thread_units, num_total_units);

Expand Down Expand Up @@ -419,7 +418,8 @@ struct AgentDFA {
const OffsetT num_total_symbols,
StateVectorT& state_vector)
{
using StateVectorTransitionOpT = StateVectorTransitionOp<NUM_STATES, StateVectorT, TransitionTableT>;
using StateVectorTransitionOpT =
StateVectorTransitionOp<NUM_STATES, StateVectorT, TransitionTableT>;

// Start parsing and to transition states
StateVectorTransitionOpT transition_op(transition_table, state_vector);
Expand Down Expand Up @@ -650,7 +650,9 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__
}

// Perform finite-state machine simulation, computing size of transduced output
DFASimulationCallbackWrapper<decltype(dfa.InitTranslationTable(transducer_table_storage)), TransducedOutItT, TransducedIndexOutItT>
DFASimulationCallbackWrapper<decltype(dfa.InitTranslationTable(transducer_table_storage)),
TransducedOutItT,
TransducedIndexOutItT>
callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it);

MultiItemStateVector<int32_t, SINGLE_ITEM_COUNT> t_start_state;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/fst/dispatch_dfa.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ struct DispatchFSM : DeviceFSMPolicy {
TransducedIndexOutItT transduced_out_idx_it;
TransducedCountOutItT d_num_transduced_out_it;
cudaStream_t stream;
int ptx_version;
int const ptx_version;

//------------------------------------------------------------------------------
// CONSTRUCTOR
Expand Down Expand Up @@ -422,7 +422,7 @@ struct DispatchFSM : DeviceFSMPolicy {
// COMPUTE STATE-TRANSITION VECTORS
//------------------------------------------------------------------------------
ScanTileStateT stv_tile_state;
if constexpr(SINGLE_PASS_STV) {
if constexpr (SINGLE_PASS_STV) {
// Construct the tile status (aliases memory internally et al.)
error = stv_tile_state.Init(
num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]);
Expand Down
42 changes: 21 additions & 21 deletions cpp/src/io/fst/in_reg_array.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,56 +35,55 @@ namespace detail {
* @tparam BackingFragmentT The data type that is holding the fragments
*/
template <uint32_t NUM_ITEMS, uint32_t MAX_ITEM_VALUE, typename BackingFragmentT = uint32_t>
struct MultiFragmentInRegArray {
/// [b] Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE]
class MultiFragmentInRegArray {
private:
/// Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE]
static constexpr uint32_t MIN_BITS_PER_ITEM =
(MAX_ITEM_VALUE == 0) ? 1 : cub::Log2<(MAX_ITEM_VALUE + 1)>::VALUE;

/// Number of bits that each fragment can store
static constexpr uint32_t NUM_BITS_PER_FRAGMENT = sizeof(BackingFragmentT) * 8;

/// [a] The number of bits per fragment per item in the array
/// The number of bits per fragment per item in the array
static constexpr uint32_t AVAIL_BITS_PER_FRAG_ITEM = NUM_BITS_PER_FRAGMENT / NUM_ITEMS;

/// [k] The number of bits per item per fragment to be a power of two to avoid costly integer
/// The number of bits per item per fragment to be a power of two to avoid costly integer
/// multiplication
/// TODO: specialise for VOLTA and later architectures that have efficient integer multiplication
static constexpr uint32_t BITS_PER_FRAG_ITEM =
0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1);
static constexpr uint32_t LOG2_BITS_PER_FRAG_ITEM = cub::Log2<BITS_PER_FRAG_ITEM>::VALUE;

// [f] Number of fragments required to store and to reconstruct an item
// Number of fragments required to store and to reconstruct each item
static constexpr uint32_t FRAGMENTS_PER_ITEM =
(MIN_BITS_PER_ITEM + BITS_PER_FRAG_ITEM - 1) / BITS_PER_FRAG_ITEM;

//------------------------------------------------------------------------------
// MEMBER VARIABLES
// HELPER FUNCTIONS
//------------------------------------------------------------------------------
__device__ __host__ __forceinline__ unsigned int bfe(const unsigned int& data,
unsigned int bit_start,
unsigned int num_bits) const
__device__ __host__ __forceinline__ uint32_t bfe(const uint32_t& data,
uint32_t bit_start,
uint32_t num_bits) const
{
#if CUB_PTX_ARCH > 0
return cub::BFE(data, bit_start, num_bits);
#else
const unsigned int MASK = (1 << num_bits) - 1;
const uint32_t MASK = (1 << num_bits) - 1;
return (data >> bit_start) & MASK;
#endif
}

__device__ __host__ __forceinline__ void bfi(unsigned int& data,
unsigned int bits,
unsigned int bit_start,
unsigned int num_bits) const
__device__ __host__ __forceinline__ void bfi(uint32_t& data,
uint32_t bits,
uint32_t bit_start,
uint32_t num_bits) const
{
#if CUB_PTX_ARCH > 0
cub::BFI(data, data, bits, bit_start, num_bits);
#else
unsigned int x = bits << bit_start;
unsigned int y = data;
unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
unsigned int MASK_Y = ~MASK_X;
data = (y & MASK_Y) | (x & MASK_X);
uint32_t x = bits << bit_start;
uint32_t y = data;
uint32_t MASK_X = ((1 << num_bits) - 1) << bit_start;
uint32_t MASK_Y = ~MASK_X;
data = (y & MASK_Y) | (x & MASK_X);
#endif
}

Expand All @@ -93,6 +92,7 @@ struct MultiFragmentInRegArray {
//------------------------------------------------------------------------------
// ACCESSORS
//------------------------------------------------------------------------------
public:
__host__ __device__ __forceinline__ uint32_t Get(int32_t index) const
{
uint32_t val = 0;
Expand Down

0 comments on commit 355d1e4

Please sign in to comment.