From 2f1bc136326bc27b7e9f032abed6caaccf21e777 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Mon, 13 May 2024 01:46:49 -0700
Subject: [PATCH 1/7] adds write-coalescing capabilities to fst

---
 cpp/benchmarks/io/fst.cu              |  16 +-
 cpp/src/io/fst/agent_dfa.cuh          | 322 +++++++++++++++++++++-----
 cpp/src/io/fst/lookup_tables.cuh      |  68 ++++--
 cpp/src/io/json/json_normalization.cu |  26 ++-
 cpp/src/io/json/nested_json_gpu.cu    |  23 +-
 cpp/tests/io/fst/common.hpp           |   2 +
 cpp/tests/io/fst/fst_test.cu          |   4 +-
 7 files changed, 366 insertions(+), 95 deletions(-)

diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu
index ad19bdfdfcb..31f1bf8e70f 100644
--- a/cpp/benchmarks/io/fst.cu
+++ b/cpp/benchmarks/io/fst.cu
@@ -95,7 +95,9 @@ void BM_FST_JSON(nvbench::state& state)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
@@ -134,7 +136,9 @@ void BM_FST_JSON_no_outidx(nvbench::state& state)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
@@ -171,7 +175,9 @@ void BM_FST_JSON_no_out(nvbench::state& state)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
@@ -209,7 +215,9 @@ void BM_FST_JSON_no_str(nvbench::state& state)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index 2171764decd..14b255e03e6 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include "cub/util_type.cuh"
 #include "in_reg_array.cuh"
 
 #include <cub/cub.cuh>
@@ -44,9 +45,10 @@ using StateIndexT = uint32_t;
 template <int32_t NUM_ITEMS>
 struct VectorCompositeOp {
   template <typename VectorT>
-  __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs)
+  __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs)
   {
     VectorT res{};
+#pragma unroll
     for (int32_t i = 0; i < NUM_ITEMS; ++i) {
       res.Set(i, rhs.Get(lhs.Get(i)));
     }
@@ -58,49 +60,136 @@ struct VectorCompositeOp {
  * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the
 * input tape.
The wrapper class looks up whether a state transition caused by a symbol is supposed * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols - * it intends to write out and writing out such symbols to the given output iterators. - * - * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up - * the symbols that are supposed to be emitted on a given state transition. - * @tparam TransducedOutItT A Random-access output iterator type to which symbols returned by the - * transducer table are assignable. - * @tparam TransducedIndexOutItT A Random-access output iterator type to which indexes are written. + * it intends to write out. */ -template -class DFASimulationCallbackWrapper { +template +class DFACountCallbackWrapper { + public: + __device__ __forceinline__ DFACountCallbackWrapper(TransducerTableT transducer_table) + : transducer_table(transducer_table) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const&) + { + out_count = 0; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + out_count += count; + } + + __host__ __device__ __forceinline__ void TearDown() {} + TransducerTableT const transducer_table; + uint32_t out_count{}; +}; + +template +class DFASWriteCallbackWrapper { public: - __host__ __device__ __forceinline__ DFASimulationCallbackWrapper( - TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it) - : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false) + __device__ __forceinline__ DFASWriteCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t out_offset, + uint32_t, + uint32_t, + uint32_t) + : transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + out_offset(out_offset) { } template - __host__ __device__ __forceinline__ void Init(OffsetT const& offset) + __device__ __forceinline__ void Init(OffsetT const& in_offset) { - this->offset = offset; - if (!write) out_count = 0; + this->in_offset = in_offset; } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, - StateIndexT const old_state, - StateIndexT const new_state, - SymbolIndexT const symbol_id, - SymbolT const read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type<1>) { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); - if (write) { -#if defined(__CUDA_ARCH__) -#pragma unroll 1 -#endif - for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = + +#pragma unroll + for (uint32_t out_char = 0; out_char < 1; out_char++) { + if (out_char < count) { + out_it[out_offset + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol); - out_idx_it[out_count + out_char] = offset + character_index; + out_idx_it[out_offset + out_char] = in_offset + character_index; } } - out_count += count; + out_offset += count; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const 
old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type<2>) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + +#pragma unroll + for (uint32_t out_char = 0; out_char < 2; out_char++) { + if (out_char < count) { + out_it[out_offset + out_char] = + transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + } + out_offset += count; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + + for (uint32_t out_char = 0; out_char < count; out_char++) { + out_it[out_offset + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + out_offset += count; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + ReadSymbol( + character_index, old_state, new_state, symbol_id, read_symbol, cub::Int2Type{}); } __host__ __device__ __forceinline__ void TearDown() {} @@ -109,9 +198,95 @@ class DFASimulationCallbackWrapper { TransducerTableT const transducer_table; TransducedOutItT out_it; TransducedIndexOutItT out_idx_it; - uint32_t out_count; - uint32_t offset; - bool write; + uint32_t out_offset; + uint32_t in_offset; +}; + +template +class WriteCoalescingCallbackWrapper { + struct TempStorage_ { + uint16_t compacted_offset[cache_size]; + out_t compacted_symbols[cache_size]; + }; + + struct TempStorage : cub::Uninitialized {}; + + __device__ __forceinline__ TempStorage_& PrivateStorage() + { + __shared__ TempStorage private_storage; + return private_storage.Alias(); + } + TempStorage_& temp_storage; + + public: + __device__ __forceinline__ WriteCoalescingCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t thread_out_offset, + uint32_t tile_out_offset, + uint32_t tile_in_offset, + uint32_t tile_out_count) + : temp_storage(PrivateStorage()), + transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + thread_out_offset(thread_out_offset), + tile_out_offset(tile_out_offset), + tile_in_offset(tile_in_offset), + tile_out_count(tile_out_count) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const& offset) + { + this->in_offset = offset; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + for (uint32_t out_char = 0; out_char < count; out_char++) { + temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = + in_offset + character_index - tile_in_offset; + temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] = + transducer_table(old_state, symbol_id, out_char, read_symbol); + } + thread_out_offset += count; + } + + __device__ __forceinline__ void TearDown() + { + __syncthreads(); + for (uint32_t out_char = threadIdx.x; 
out_char < tile_out_count; out_char += blockDim.x) { + out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; + } + for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_idx_it[tile_out_offset + out_char] = + temp_storage.compacted_offset[out_char] + tile_in_offset; + } + __syncthreads(); + } + + public: + TransducerTableT const transducer_table; + TransducedOutItT out_it; + TransducedIndexOutItT out_idx_it; + uint32_t thread_out_offset; + uint32_t tile_out_offset; + uint32_t tile_in_offset; + uint32_t in_offset; + uint32_t tile_out_count; }; /** @@ -125,17 +300,18 @@ class DFASimulationCallbackWrapper { template class StateVectorTransitionOp { public: - __host__ __device__ __forceinline__ StateVectorTransitionOp( + __device__ __forceinline__ StateVectorTransitionOp( TransitionTableT const& transition_table, std::array& state_vector) : transition_table(transition_table), state_vector(state_vector) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) const + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) const { +#pragma unroll for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector[i] = transition_table(state_vector[i], read_symbol_id); } @@ -152,17 +328,17 @@ struct StateTransitionOp { TransitionTableT const& transition_table; CallbackOpT& callback_op; - __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, - StateIndexT state, - CallbackOpT& callback_op) + __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, + StateIndexT state, + CallbackOpT& callback_op) : transition_table(transition_table), state(state), callback_op(callback_op) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) { // Remember what state we were in before we made the transition StateIndexT previous_state = state; @@ -420,7 +596,7 @@ struct AgentDFA { __syncthreads(); // Thread's symbols - CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + CharT const* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; // Parse thread's symbols and transition the state-vector if (is_full_block) { @@ -538,6 +714,33 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // The state transition vector passed on to the second stage of the algorithm StateVectorT out_state_vector; + using OutSymbolT = typename DfaT::OutSymbolT; + // static constexpr int32_t MIN_TRANSLATED_OUT = DfaT::MIN_TRANSLATED_OUT; + static constexpr int32_t MAX_TRANSLATED_OUT = DfaT::MAX_TRANSLATED_OUT; + using NonWriteCoalescingT = + DFASWriteCallbackWrapper; + + using WriteCoalescingT = + WriteCoalescingCallbackWrapper; + + // static constexpr bool is_mapping_fst = (MIN_TRANSLATED_OUT == 1) and (MAX_TRANSLATED_OUT == 1); + static constexpr bool is_translation_pass = (!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS; + + // Use write-coalescing only if it's + static constexpr bool use_shmem_cache = + is_translation_pass and (sizeof(OutSymbolT) * MAX_TRANSLATED_OUT <= 4); + + using 
DFASimulationCallbackWrapperT = + typename cub::If::Type; + // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { // Keeping track of the state for each of the state machines @@ -576,7 +779,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that // incorporates all preceding blocks/tiles results) //------------------------------------------------------------------------------ - if (IS_SINGLE_PASS) { + if constexpr (IS_SINGLE_PASS) { uint32_t tile_idx = blockIdx.x; using StateVectorCompositeOpT = VectorCompositeOp; @@ -623,10 +826,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL } // Perform finite-state machine simulation, computing size of transduced output - DFASimulationCallbackWrapper - callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); + DFACountCallbackWrapper count_chars_writte_callback_op{transducer_table}; StateIndexT t_start_state = state; agent_dfa.GetThreadStateTransitions(symbol_matcher, @@ -635,7 +835,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, state, - callback_wrapper, + count_chars_writte_callback_op, cub::Int2Type()); __syncthreads(); @@ -650,15 +850,18 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage; uint32_t tile_idx = blockIdx.x; + uint32_t tile_out_offset{}; + uint32_t tile_out_count{}; + uint32_t thread_out_offset{}; if (tile_idx == 0) { OffsetT block_aggregate = 0; OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan(callback_wrapper.out_count, - callback_wrapper.out_count, + .ExclusiveScan(count_chars_writte_callback_op.out_count, + thread_out_offset, static_cast(0), cub::Sum{}, block_aggregate); - + tile_out_count = block_aggregate; if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { offset_tile_state.SetInclusive(0, block_aggregate); } @@ -672,21 +875,28 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL OutOffsetBlockScan(scan_temp_storage) .ExclusiveScan( - callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op); - + count_chars_writte_callback_op.out_count, thread_out_offset, cub::Sum{}, prefix_op); + tile_out_offset = prefix_op.GetExclusivePrefix(); + tile_out_count = prefix_op.GetBlockAggregate(); if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { *d_num_transduced_out_it = prefix_op.GetInclusivePrefix(); } } - callback_wrapper.write = true; + DFASimulationCallbackWrapperT write_translated_callback_op{transducer_table, + transduced_out_it, + transduced_out_idx_it, + thread_out_offset, + tile_out_offset, + blockIdx.x * SYMBOLS_PER_BLOCK, + tile_out_count}; agent_dfa.GetThreadStateTransitions(symbol_matcher, transition_table, d_chars, blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, t_start_state, - callback_wrapper, + write_translated_callback_op, cub::Int2Type()); } } diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 5532a7f994b..cce41c395a4 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -367,18 +367,18 @@ class TransitionTable { template static KernelParameter InitDeviceTransitionTable( - std::array, MAX_NUM_STATES> const& translation_table) + std::array, MAX_NUM_STATES> const& transition_table) { KernelParameter init_data{}; - // 
translation_table[state][symbol] -> new state - for (std::size_t state = 0; state < translation_table.size(); ++state) { - for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { + // transition_table[state][symbol] -> new state + for (std::size_t state = 0; state < transition_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < transition_table[state].size(); ++symbol) { CUDF_EXPECTS( - static_cast(translation_table[state][symbol]) <= + static_cast(transition_table[state][symbol]) <= std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); init_data.transitions[symbol * MAX_NUM_STATES + state] = - static_cast(translation_table[state][symbol]); + static_cast(transition_table[state][symbol]); } } @@ -494,6 +494,10 @@ class dfa_device_view { // This is a value queried by the DFA simulation algorithm static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + using OutSymbolT = typename TranslationTableT::OutSymbolT; + static constexpr int32_t MIN_TRANSLATED_OUT = TranslationTableT::MIN_TRANSLATED_OUT; + static constexpr int32_t MAX_TRANSLATED_OUT = TranslationTableT::MAX_TRANSLATED_OUT; + using SymbolGroupStorageT = std::conditional_t::value, typename SymbolGroupIdLookupT::TempStorage, typename cub::NullType>; @@ -542,24 +546,33 @@ class dfa_device_view { * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output * symbols - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MIN_TRANSLATED_OUT_ The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT_ The maximum number of symbols being output by a single state + * transition * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used. */ -template class TransducerLookupTable { private: struct _TempStorage { OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT out_symbols[MAX_TABLE_SIZE]; + OutSymbolT_ out_symbols[MAX_TABLE_SIZE]; }; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { @@ -567,6 +580,8 @@ class TransducerLookupTable { OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; @@ -686,14 +701,19 @@ class TransducerLookupTable { * sequence of symbols that the finite-state transducer is supposed to output for each transition. 
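+ *
+ * For illustration, a minimal hypothetical invocation (the table shape and values below are
+ * assumptions for this sketch, not taken from this patch): a one-state transducer over two
+ * symbol groups, where group 0 emits '{' and group 1 emits nothing, so the translated output
+ * per transition ranges from 0 to 1 symbols:
+ * @code
+ * std::array<std::array<std::vector<char>, 2>, 1> tt{{{std::vector<char>{'{'}, {}}}};
+ * auto table = make_translation_table<2, 0, 1>(tt);
+ * @endcode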
* * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used + * @tparam MIN_TRANSLATED_OUT The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT The maximum number of symbols being output by a single state + * transition * @tparam OutSymbolT The symbol type being output - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support * @param translation_table The translation table * @return A translation table of type `TransducerLookupTable`. */ template @@ -705,20 +725,30 @@ auto make_translation_table(std::array, MAX_N OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; return translation_table_t::InitDeviceTranslationTable(translation_table); } -template +template class TranslationOp { private: struct _TempStorage {}; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - using LookupTableT = TranslationOp; + using LookupTableT = + TranslationOp; TranslationOpT translation_op; }; @@ -772,6 +802,8 @@ class TranslationOp { * * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)` + * @tparam MAX_TRANSLATED_SYMBOLS The maximum number of translated output symbols for any given + * input symbol * @param map_op A function object that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`. 
* Invocations of the first signature, (1), must return the number of symbols that are emitted for @@ -779,10 +811,14 @@ class TranslationOp { * that transition, where `i` corresponds to `relative_offse` * @return A translation table of type `TranslationO` */ -template +template auto make_translation_functor(FunctorT map_op) { - return TranslationOp::InitDeviceTranslationTable(map_op); + return TranslationOp:: + InitDeviceTranslationTable(map_op); } /** diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index ca56a12eb36..760b2214365 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -302,11 +302,14 @@ void normalize_single_quotes(datasource::owning_buffer( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_uvector outbuf(indata.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); @@ -327,11 +330,14 @@ void normalize_whitespace(datasource::owning_buffer rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto parser = fst::detail::make_fst( - fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), - fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}), - stream); + static constexpr std::int32_t min_out = 0; + static constexpr std::int32_t max_out = 2; + auto parser = + fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), + fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), + fst::detail::make_translation_functor( + normalize_whitespace::TransduceToNormalizedWS{}), + stream); rmm::device_uvector outbuf(indata.size(), stream, mr); rmm::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 8da1bb3ddfc..d896dd95787 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1433,10 +1433,15 @@ void get_stack_context(device_span json_in, ? 
to_stack_op::resetting_translation_table : to_stack_op::translation_table; + static constexpr auto min_translated_out = 0; + static constexpr auto max_translated_out = 1; + auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::symbol_groups), fst::detail::make_transition_table(transition_table), - fst::detail::make_translation_table(translation_table), + fst::detail::make_translation_table(translation_table), stream); // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end @@ -1484,11 +1489,12 @@ std::pair, rmm::device_uvector> pr // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; - auto filter_fst = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), - fst::detail::make_transition_table(token_filter::transition_table), - fst::detail::make_translation_functor(token_filter::TransduceToken{}), - stream); + using symbol_t = thrust::tuple; + auto filter_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); auto const mr = rmm::mr::get_current_device_resource(); rmm::device_scalar d_num_selected_tokens(stream, mr); @@ -1575,7 +1581,8 @@ std::pair, rmm::device_uvector> ge fst::detail::make_symbol_group_lookup_op( fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{}), fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), - fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + fst::detail::make_translation_functor( + fix_stack_of_excess_chars::TransduceInputOp{}), stream); fix_stack_of_excess_chars.Transduce(zip_in, static_cast(json_in.size()), @@ -1595,7 +1602,7 @@ std::pair, rmm::device_uvector> ge auto json_to_tokens_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{}), fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)), - fst::detail::make_translation_table( + fst::detail::make_translation_table( tokenizer_pda::get_translation_table(recover_from_error)), stream); diff --git a/cpp/tests/io/fst/common.hpp b/cpp/tests/io/fst/common.hpp index 382d21fabb8..47ef6a1a0de 100644 --- a/cpp/tests/io/fst/common.hpp +++ b/cpp/tests/io/fst/common.hpp @@ -69,6 +69,8 @@ std::array, TT_NUM_STATES> const pda_s /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; // Translation table (i.e., for each transition, what are the symbols that we output) +static constexpr auto min_translated_out = 1; +static constexpr auto max_translated_out = 1; std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const pda_out_tt{ {/* IN_STATE { [ } ] " \ OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 4df0d3ae04d..8a8d3d39e0f 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -169,7 +169,9 @@ TEST_F(FstTest, GroundTruth) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + 
cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); // Allocate device-side temporary storage & run algorithm From a3863c5f338dfad91bb321e4734b4d7c709c23ea Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 13 May 2024 02:33:06 -0700 Subject: [PATCH 2/7] tune fst --- cpp/src/io/fst/dispatch_dfa.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index be63ec6539f..1fb86465ef8 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -49,7 +49,7 @@ struct DeviceFSMPolicy { struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> { enum { BLOCK_THREADS = 128, - ITEMS_PER_THREAD = 32, + ITEMS_PER_THREAD = 16, }; using AgentDFAPolicy = AgentDFAPolicy; From 5967d7b4007c47fe8a12dd32b2161d839b002f54 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 13 May 2024 03:16:55 -0700 Subject: [PATCH 3/7] short-circuit on discard iterators --- cpp/src/io/fst/agent_dfa.cuh | 56 +++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 14b255e03e6..3df5a988ef2 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -15,11 +15,12 @@ */ #pragma once -#include "cub/util_type.cuh" #include "in_reg_array.cuh" #include +#include #include +#include #include namespace cudf::io::fst::detail { @@ -202,19 +203,26 @@ class DFASWriteCallbackWrapper { uint32_t in_offset; }; -template class WriteCoalescingCallbackWrapper { - struct TempStorage_ { + struct TempStorage_Offsets { uint16_t compacted_offset[cache_size]; + }; + struct TempStorage_Symbols { out_t compacted_symbols[cache_size]; }; - - struct TempStorage : cub::Uninitialized {}; + using offset_cache_t = + ::cuda::std::conditional_t; + using symbol_cache_t = + ::cuda::std::conditional_t, TempStorage_Symbols>; + struct TempStorage_ : offset_cache_t, symbol_cache_t {}; __device__ __forceinline__ TempStorage_& PrivateStorage() { @@ -224,6 +232,8 @@ class WriteCoalescingCallbackWrapper { TempStorage_& temp_storage; public: + struct TempStorage : cub::Uninitialized {}; + __device__ __forceinline__ WriteCoalescingCallbackWrapper(TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it, @@ -257,10 +267,14 @@ class WriteCoalescingCallbackWrapper { { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); for (uint32_t out_char = 0; out_char < count; out_char++) { - temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = - in_offset + character_index - tile_in_offset; - temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] = - transducer_table(old_state, symbol_id, out_char, read_symbol); + if constexpr (!discard_idx) { + temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = + in_offset + character_index - tile_in_offset; + } + if constexpr (!discard_out) { + temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] = + transducer_table(old_state, symbol_id, out_char, read_symbol); + } } thread_out_offset += count; } @@ -268,12 +282,16 @@ class WriteCoalescingCallbackWrapper { __device__ __forceinline__ void TearDown() { __syncthreads(); - for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { - out_it[tile_out_offset + out_char] = 
temp_storage.compacted_symbols[out_char]; + if constexpr (!discard_out) { + for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; + } } - for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { - out_idx_it[tile_out_offset + out_char] = - temp_storage.compacted_offset[out_char] + tile_in_offset; + if constexpr (!discard_idx) { + for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_idx_it[tile_out_offset + out_char] = + temp_storage.compacted_offset[out_char] + tile_in_offset; + } } __syncthreads(); } @@ -717,6 +735,10 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL using OutSymbolT = typename DfaT::OutSymbolT; // static constexpr int32_t MIN_TRANSLATED_OUT = DfaT::MIN_TRANSLATED_OUT; static constexpr int32_t MAX_TRANSLATED_OUT = DfaT::MAX_TRANSLATED_OUT; + static constexpr bool discard_out_index = + ::cuda::std::is_same>::value; + static constexpr bool discard_out_it = + ::cuda::std::is_same>::value; using NonWriteCoalescingT = DFASWriteCallbackWrapper; using WriteCoalescingT = - WriteCoalescingCallbackWrapper::Type; From bbf56f3c177be75b4bbb07f32546e9222ab62cd1 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 1 Jul 2024 04:55:00 -0700 Subject: [PATCH 4/7] fixes style --- cpp/src/io/fst/agent_dfa.cuh | 161 ++++++++++++++++------------- cpp/src/io/json/nested_json_gpu.cu | 7 +- 2 files changed, 95 insertions(+), 73 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 3df5a988ef2..4b2772fcec8 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -60,7 +60,7 @@ struct VectorCompositeOp { /** * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed - * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols + * to emit any output symbol (the "transduced" output) and, if so, keeps track of *how many* symbols * it intends to write out. 
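+ * This wrapper implements the sizing (counting) pass of the two-pass transduction: the kernel
+ * first tallies per-thread output counts and scans them into write offsets, and only then runs
+ * the write pass that actually emits the transduced symbols.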
*/ template @@ -88,12 +88,26 @@ class DFACountCallbackWrapper { out_count += count; } - __host__ __device__ __forceinline__ void TearDown() {} + __device__ __forceinline__ void TearDown() {} TransducerTableT const transducer_table; uint32_t out_count{}; }; -template @@ -103,9 +117,9 @@ class DFASWriteCallbackWrapper { TransducedOutItT out_it, TransducedIndexOutItT out_idx_it, uint32_t out_offset, - uint32_t, - uint32_t, - uint32_t) + uint32_t /*tile_out_offset*/, + uint32_t /*tile_in_offset*/, + uint32_t /*tile_out_count*/) : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), @@ -119,39 +133,24 @@ class DFASWriteCallbackWrapper { this->in_offset = in_offset; } - template - __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, - StateIndexT const old_state, - StateIndexT const new_state, - SymbolIndexT const symbol_id, - SymbolT const read_symbol, - cub::Int2Type<1>) - { - uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); - -#pragma unroll - for (uint32_t out_char = 0; out_char < 1; out_char++) { - if (out_char < count) { - out_it[out_offset + out_char] = - transducer_table(old_state, symbol_id, out_char, read_symbol); - out_idx_it[out_offset + out_char] = in_offset + character_index; - } - } - out_offset += count; - } - - template - __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, - StateIndexT const old_state, - StateIndexT const new_state, - SymbolIndexT const symbol_id, - SymbolT const read_symbol, - cub::Int2Type<2>) + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ <= 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type /*MaxTranslatedOutChars*/) { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); #pragma unroll - for (uint32_t out_char = 0; out_char < 2; out_char++) { + for (uint32_t out_char = 0; out_char < MaxTranslatedOutChars_; out_char++) { if (out_char < count) { out_it[out_offset + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol); @@ -165,13 +164,15 @@ class DFASWriteCallbackWrapper { typename StateIndexT, typename SymbolIndexT, typename SymbolT, - int N> - __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, - StateIndexT const old_state, - StateIndexT const new_state, - SymbolIndexT const symbol_id, - SymbolT const read_symbol, - cub::Int2Type) + int MaxTranslatedOutChars_> + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ > 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type) { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); @@ -189,11 +190,15 @@ class DFASWriteCallbackWrapper { SymbolIndexT const symbol_id, SymbolT const read_symbol) { - ReadSymbol( - character_index, old_state, new_state, symbol_id, read_symbol, cub::Int2Type{}); + ReadSymbol(character_index, + old_state, + new_state, + symbol_id, + read_symbol, + cub::Int2Type{}); } - __host__ __device__ __forceinline__ void TearDown() {} + __device__ __forceinline__ void TearDown() {} public: TransducerTableT const transducer_table; @@ -203,25 +208,44 @@ class DFASWriteCallbackWrapper { uint32_t in_offset; }; -template class 
WriteCoalescingCallbackWrapper { struct TempStorage_Offsets { - uint16_t compacted_offset[cache_size]; + uint16_t compacted_offset[NumWriteBufferItems]; }; struct TempStorage_Symbols { - out_t compacted_symbols[cache_size]; + OutputT compacted_symbols[NumWriteBufferItems]; }; using offset_cache_t = - ::cuda::std::conditional_t; - using symbol_cache_t = - ::cuda::std::conditional_t, TempStorage_Symbols>; + ::cuda::std::conditional_t; + using symbol_cache_t = ::cuda::std:: + conditional_t, TempStorage_Symbols>; struct TempStorage_ : offset_cache_t, symbol_cache_t {}; __device__ __forceinline__ TempStorage_& PrivateStorage() @@ -267,11 +291,11 @@ class WriteCoalescingCallbackWrapper { { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); for (uint32_t out_char = 0; out_char < count; out_char++) { - if constexpr (!discard_idx) { + if constexpr (!DisrcardIndexOutput) { temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = in_offset + character_index - tile_in_offset; } - if constexpr (!discard_out) { + if constexpr (!DiscardTranslatedOutput) { temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] = transducer_table(old_state, symbol_id, out_char, read_symbol); } @@ -282,12 +306,12 @@ class WriteCoalescingCallbackWrapper { __device__ __forceinline__ void TearDown() { __syncthreads(); - if constexpr (!discard_out) { + if constexpr (!DiscardTranslatedOutput) { for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; } } - if constexpr (!discard_idx) { + if constexpr (!DisrcardIndexOutput) { for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { out_idx_it[tile_out_offset + out_char] = temp_storage.compacted_offset[out_char] + tile_in_offset; @@ -734,13 +758,13 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL using OutSymbolT = typename DfaT::OutSymbolT; // static constexpr int32_t MIN_TRANSLATED_OUT = DfaT::MIN_TRANSLATED_OUT; - static constexpr int32_t MAX_TRANSLATED_OUT = DfaT::MAX_TRANSLATED_OUT; + static constexpr int32_t num_max_translated_out = DfaT::MAX_TRANSLATED_OUT; static constexpr bool discard_out_index = ::cuda::std::is_same>::value; static constexpr bool discard_out_it = ::cuda::std::is_same>::value; using NonWriteCoalescingT = - DFASWriteCallbackWrapper; @@ -748,17 +772,15 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL using WriteCoalescingT = WriteCoalescingCallbackWrapper; - // static constexpr bool is_mapping_fst = (MIN_TRANSLATED_OUT == 1) and (MAX_TRANSLATED_OUT == 1); static constexpr bool is_translation_pass = (!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS; - // Use write-coalescing only if it's + // Use write-coalescing only if the worst-case output size per tile fits into shared memory static constexpr bool use_shmem_cache = is_translation_pass and (sizeof(typename WriteCoalescingT::TempStorage) <= 24 * 1024); @@ -850,7 +872,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL } // Perform finite-state machine simulation, computing size of transduced output - DFACountCallbackWrapper count_chars_writte_callback_op{transducer_table}; + DFACountCallbackWrapper count_chars_callback_op{transducer_table}; StateIndexT t_start_state = state; agent_dfa.GetThreadStateTransitions(symbol_matcher, @@ -859,7 +881,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL blockIdx.x * 
SYMBOLS_PER_BLOCK, num_chars, state, - count_chars_writte_callback_op, + count_chars_callback_op, cub::Int2Type()); __syncthreads(); @@ -880,7 +902,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL if (tile_idx == 0) { OffsetT block_aggregate = 0; OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan(count_chars_writte_callback_op.out_count, + .ExclusiveScan(count_chars_callback_op.out_count, thread_out_offset, static_cast(0), cub::Sum{}, @@ -898,8 +920,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx); OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan( - count_chars_writte_callback_op.out_count, thread_out_offset, cub::Sum{}, prefix_op); + .ExclusiveScan(count_chars_callback_op.out_count, thread_out_offset, cub::Sum{}, prefix_op); tile_out_offset = prefix_op.GetExclusivePrefix(); tile_out_count = prefix_op.GetBlockAggregate(); if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index e9cb2b7a2b7..1eca15528a3 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1457,11 +1457,12 @@ void get_stack_context(device_span json_in, static constexpr auto min_translated_out = 0; static constexpr auto max_translated_out = 1; - auto json_to_stack_ops_fst = fst::detail::make_fst( + auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::get_sgid_lut(delimiter)), fst::detail::make_transition_table(to_stack_op::get_transition_table(stack_behavior)), - fst::detail::make_translation_table( - to_stack_op::get_translation_table(stack_behavior)), + fst::detail:: + make_translation_table( + to_stack_op::get_translation_table(stack_behavior)), stream); // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end From a281c213069a6c9f3b4a4faf4a023f309f00aec6 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 1 Jul 2024 05:41:52 -0700 Subject: [PATCH 5/7] fixes copyright --- cpp/tests/io/fst/common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/fst/common.hpp b/cpp/tests/io/fst/common.hpp index 47ef6a1a0de..0177300eda9 100644 --- a/cpp/tests/io/fst/common.hpp +++ b/cpp/tests/io/fst/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
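
The write-coalescing path introduced in this series works in two passes: a counting pass
(DFACountCallbackWrapper) feeds a block-wide exclusive scan that assigns every thread its output
offset, and a write pass (WriteCoalescingCallbackWrapper) stages each tile's transduced symbols
and indexes in shared memory before flushing them to global memory with block-strided stores.
The standalone CUDA sketch below distills that staging pattern; the kernel name, sizes, and the
toy "duplicate every 'x'" emit rule are illustrative assumptions, not code from this series.

// Minimal sketch of shared-memory write coalescing (illustrative; not part of the patch).
#include <cub/cub.cuh>

#include <cuda_runtime.h>

#include <cstdio>

constexpr int BLOCK_THREADS    = 128;
constexpr int MAX_OUT_PER_ITEM = 2;                                // worst-case outputs per thread
constexpr int TILE_OUT_ITEMS   = BLOCK_THREADS * MAX_OUT_PER_ITEM; // shared staging capacity

__global__ void coalesced_emit(char const* in, int* out_count, char* out)
{
  using BlockScan = cub::BlockScan<int, BLOCK_THREADS>;
  __shared__ typename BlockScan::TempStorage scan_storage;
  __shared__ char staged[TILE_OUT_ITEMS];

  // Toy transduction rule: every 'x' is emitted twice, everything else once.
  char const symbol  = in[threadIdx.x];
  int const my_count = (symbol == 'x') ? 2 : 1;

  // Exclusive scan turns per-thread counts into per-thread write offsets within the tile.
  int my_offset  = 0;
  int tile_count = 0;
  BlockScan(scan_storage).ExclusiveSum(my_count, my_offset, tile_count);

  // The scattered, data-dependent writes land in fast shared memory...
  for (int i = 0; i < my_count; ++i) {
    staged[my_offset + i] = symbol;
  }
  __syncthreads();

  // ...and the tile is flushed with contiguous, block-strided (coalesced) global stores.
  // A multi-tile version would add a running inter-tile prefix to the destination index.
  for (int i = threadIdx.x; i < tile_count; i += blockDim.x) {
    out[i] = staged[i];
  }
  if (threadIdx.x == 0) { *out_count = tile_count; }
}

int main()
{
  char h_in[BLOCK_THREADS];
  for (int i = 0; i < BLOCK_THREADS; ++i) { h_in[i] = (i % 3 == 0) ? 'x' : 'a'; }

  char *d_in, *d_out;
  int* d_count;
  cudaMalloc(&d_in, BLOCK_THREADS);
  cudaMalloc(&d_out, TILE_OUT_ITEMS);
  cudaMalloc(&d_count, sizeof(int));
  cudaMemcpy(d_in, h_in, BLOCK_THREADS, cudaMemcpyHostToDevice);

  coalesced_emit<<<1, BLOCK_THREADS>>>(d_in, d_count, d_out);

  int h_count = 0;
  cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("emitted %d symbols\n", h_count);

  cudaFree(d_in);
  cudaFree(d_out);
  cudaFree(d_count);
  return 0;
}

Staging in shared memory converts the data-dependent, scattered per-thread stores into contiguous
global writes, which is why the series can afford simulating the DFA a second time for the write
pass.
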
From 071d520ac0775ab506c7d830980fc042505af440 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 9 Jul 2024 21:33:29 -0700 Subject: [PATCH 6/7] fixes style and makes smem threshold a tuning policy parameter --- cpp/src/io/fst/agent_dfa.cuh | 46 +++++++++++++++++++-------------- cpp/src/io/fst/dispatch_dfa.cuh | 5 ++++ 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 4b2772fcec8..bc5b94e2718 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -111,15 +111,15 @@ template -class DFASWriteCallbackWrapper { +class DFAWriteCallbackWrapper { public: - __device__ __forceinline__ DFASWriteCallbackWrapper(TransducerTableT transducer_table, - TransducedOutItT out_it, - TransducedIndexOutItT out_idx_it, - uint32_t out_offset, - uint32_t /*tile_out_offset*/, - uint32_t /*tile_in_offset*/, - uint32_t /*tile_out_count*/) + __device__ __forceinline__ DFAWriteCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t out_offset, + uint32_t /*tile_out_offset*/, + uint32_t /*tile_in_offset*/, + uint32_t /*tile_out_count*/) : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), @@ -215,7 +215,7 @@ class DFASWriteCallbackWrapper { * given output iterators. This class uses a shared memory-backed write buffer to coalesce writes to * global memory. * - * @tparam DisrcardIndexOutput Whether to discard the indexes instead of writing them to the given + * @tparam DiscardIndexOutput Whether to discard the indexes instead of writing them to the given * output iterator * @tparam DiscardTranslatedOutput Whether to discard the translated output symbols instead of * writing them to the given output iterator @@ -228,7 +228,7 @@ class DFASWriteCallbackWrapper { * transducer table are assignable. * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written. 
*/ -template ; + ::cuda::std::conditional_t; using symbol_cache_t = ::cuda::std:: conditional_t, TempStorage_Symbols>; struct TempStorage_ : offset_cache_t, symbol_cache_t {}; @@ -291,7 +291,7 @@ class WriteCoalescingCallbackWrapper { { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); for (uint32_t out_char = 0; out_char < count; out_char++) { - if constexpr (!DisrcardIndexOutput) { + if constexpr (!DiscardIndexOutput) { temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = in_offset + character_index - tile_in_offset; } @@ -311,7 +311,7 @@ class WriteCoalescingCallbackWrapper { out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; } } - if constexpr (!DisrcardIndexOutput) { + if constexpr (!DiscardIndexOutput) { for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { out_idx_it[tile_out_offset + out_char] = temp_storage.compacted_offset[out_char] + tile_in_offset; @@ -764,10 +764,10 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL static constexpr bool discard_out_it = ::cuda::std::is_same>::value; using NonWriteCoalescingT = - DFASWriteCallbackWrapper; + DFAWriteCallbackWrapper; using WriteCoalescingT = WriteCoalescingCallbackWrapper::Type; + typename cub::If::Type; // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index 1fb86465ef8..ef5e9c8a78f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -37,6 +37,11 @@ struct AgentDFAPolicy { // The number of symbols processed by each thread static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD; + + // If the shared memory-backed write buffer exceeds this threshold, the FST will skip buffering + // the output in a write buffer and instead immediately write out to global memory, potentially + // resulting in non-coalesced writes + static constexpr std::size_t SMEM_THRESHOLD = 24 * 1024; }; /** From 3d666eb9439e3bca6809638e76b5a14579acc65f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 17 Jul 2024 00:58:04 -0700 Subject: [PATCH 7/7] docs fix --- cpp/src/io/fst/lookup_tables.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index cce41c395a4..ae1f81fd541 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -802,6 +802,8 @@ class TranslationOp { * * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)` + * @tparam MIN_TRANSLATED_SYMBOLS The minimum number of translated output symbols for any given + * input symbol * @tparam MAX_TRANSLATED_SYMBOLS The maximum number of translated output symbols for any given * input symbol * @param map_op A function object that must implement two signatures: (1) with `(state_id,