From 3fa94f442464e55e68a0eaeb30782f57629aed4d Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Mon, 12 Jul 2021 10:01:52 +0800 Subject: [PATCH] [Feature] better tensorrt cpp code (#11) * better tensorrt cpp code * fix end of file --- .../{gridSample.cpp => grid_sample.cpp} | 0 .../tensorrt/batched_nms/trt_batched_nms.cpp | 99 ++++----- .../tensorrt/batched_nms/trt_batched_nms.hpp | 53 +---- ...ference.cpp => trt_batched_nms_kernel.cpp} | 3 +- .../batched_nms/trt_batched_nms_kernel.hpp | 16 ++ .../tensorrt/common/common_cuda_helper.hpp | 109 ++++------ .../tensorrt/common/trt_cuda_helper.cuh | 30 --- .../tensorrt/common/trt_plugin_base.hpp | 55 +++++ .../tensorrt/common/trt_plugin_helper.hpp | 1 + backend_ops/tensorrt/common/trt_serialize.hpp | 4 - .../tensorrt/common_impl/trt_cuda_helper.cu | 1 - .../trt_multi_level_roi_align.cpp | 109 ++++------ .../trt_multi_level_roi_align.hpp | 58 ++--- .../trt_multi_level_roi_align_kernel.cu | 48 ----- backend_ops/tensorrt/nms/trt_nms.cpp | 152 +++++-------- backend_ops/tensorrt/nms/trt_nms.hpp | 51 +---- backend_ops/tensorrt/nms/trt_nms_kernel.cu | 80 ++++++- backend_ops/tensorrt/nms/trt_nms_kernel.cuh | 67 ------ backend_ops/tensorrt/nms/trt_nms_kernel.hpp | 18 ++ .../tensorrt/roi_align/trt_roi_align.cpp | 140 +++++------- .../tensorrt/roi_align/trt_roi_align.hpp | 51 +---- .../roi_align/trt_roi_align_kernel.cu | 104 ++++++++- .../roi_align/trt_roi_align_kernel.cuh | 204 ------------------ .../roi_align/trt_roi_align_kernel.hpp | 14 ++ .../tensorrt/scatternd/trt_scatternd.cpp | 101 +++------ .../tensorrt/scatternd/trt_scatternd.hpp | 47 +--- .../scatternd/trt_scatternd_kernel.cu | 1 - 27 files changed, 573 insertions(+), 1043 deletions(-) rename backend_ops/onnxruntime/grid_sample/{gridSample.cpp => grid_sample.cpp} (100%) rename backend_ops/tensorrt/batched_nms/{batchedNMSInference.cpp => trt_batched_nms_kernel.cpp} (99%) create mode 100644 backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.hpp delete mode 100644 backend_ops/tensorrt/common/trt_cuda_helper.cuh create mode 100644 backend_ops/tensorrt/common/trt_plugin_base.hpp delete mode 100644 backend_ops/tensorrt/nms/trt_nms_kernel.cuh create mode 100644 backend_ops/tensorrt/nms/trt_nms_kernel.hpp delete mode 100644 backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cuh create mode 100644 backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp diff --git a/backend_ops/onnxruntime/grid_sample/gridSample.cpp b/backend_ops/onnxruntime/grid_sample/grid_sample.cpp similarity index 100% rename from backend_ops/onnxruntime/grid_sample/gridSample.cpp rename to backend_ops/onnxruntime/grid_sample/grid_sample.cpp diff --git a/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp b/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp index bdde30d6de..e7f39c5942 100644 --- a/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp +++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp @@ -6,8 +6,10 @@ #include #include "kernel.h" +#include "trt_batched_nms_kernel.hpp" #include "trt_serialize.hpp" +namespace mmlab { using namespace nvinfer1; using nvinfer1::plugin::NMSParameters; @@ -16,11 +18,12 @@ static const char* NMS_PLUGIN_VERSION{"1"}; static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"}; } // namespace -TRTBatchedNMSPluginDynamic::TRTBatchedNMSPluginDynamic(NMSParameters params) - : param(params) {} +TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params) + : TRTPluginBase(name), param(params) {} -TRTBatchedNMSPluginDynamic::TRTBatchedNMSPluginDynamic(const void* data, - size_t length) { +TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data, + size_t length) + : TRTPluginBase(name) { deserialize_value(&data, &length, ¶m); deserialize_value(&data, &length, &boxesSize); deserialize_value(&data, &length, &scoresSize); @@ -28,13 +31,9 @@ TRTBatchedNMSPluginDynamic::TRTBatchedNMSPluginDynamic(const void* data, deserialize_value(&data, &length, &mClipBoxes); } -int TRTBatchedNMSPluginDynamic::getNbOutputs() const { return 2; } +int TRTBatchedNMS::getNbOutputs() const { return 2; } -int TRTBatchedNMSPluginDynamic::initialize() { return STATUS_SUCCESS; } - -void TRTBatchedNMSPluginDynamic::terminate() {} - -nvinfer1::DimsExprs TRTBatchedNMSPluginDynamic::getOutputDimensions( +nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) { ASSERT(nbInputs == 2); @@ -60,7 +59,7 @@ nvinfer1::DimsExprs TRTBatchedNMSPluginDynamic::getOutputDimensions( return ret; } -size_t TRTBatchedNMSPluginDynamic::getWorkspaceSize( +size_t TRTBatchedNMS::getWorkspaceSize( const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { size_t batch_size = inputs[0].dims.d[0]; @@ -75,10 +74,10 @@ size_t TRTBatchedNMSPluginDynamic::getWorkspaceSize( num_priors, topk, DataType::kFLOAT, DataType::kFLOAT); } -int TRTBatchedNMSPluginDynamic::enqueue( - const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workSpace, cudaStream_t stream) { +int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workSpace, cudaStream_t stream) { const void* const locData = inputs[0]; const void* const confData = inputs[1]; @@ -103,12 +102,12 @@ int TRTBatchedNMSPluginDynamic::enqueue( return 0; } -size_t TRTBatchedNMSPluginDynamic::getSerializationSize() const { +size_t TRTBatchedNMS::getSerializationSize() const { // NMSParameters, boxesSize,scoresSize,numPriors return sizeof(NMSParameters) + sizeof(int) * 3 + sizeof(bool); } -void TRTBatchedNMSPluginDynamic::serialize(void* buffer) const { +void TRTBatchedNMS::serialize(void* buffer) const { serialize_value(&buffer, param); serialize_value(&buffer, boxesSize); serialize_value(&buffer, scoresSize); @@ -116,13 +115,13 @@ void TRTBatchedNMSPluginDynamic::serialize(void* buffer) const { serialize_value(&buffer, mClipBoxes); } -void TRTBatchedNMSPluginDynamic::configurePlugin( +void TRTBatchedNMS::configurePlugin( const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) { // Validate input arguments } -bool TRTBatchedNMSPluginDynamic::supportsFormatCombination( +bool TRTBatchedNMS::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) { if (pos == 3) { @@ -133,18 +132,14 @@ bool TRTBatchedNMSPluginDynamic::supportsFormatCombination( inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; } -const char* TRTBatchedNMSPluginDynamic::getPluginType() const { - return NMS_PLUGIN_NAME; -} +const char* TRTBatchedNMS::getPluginType() const { return NMS_PLUGIN_NAME; } -const char* TRTBatchedNMSPluginDynamic::getPluginVersion() const { +const char* TRTBatchedNMS::getPluginVersion() const { return NMS_PLUGIN_VERSION; } -void TRTBatchedNMSPluginDynamic::destroy() { delete this; } - -IPluginV2DynamicExt* TRTBatchedNMSPluginDynamic::clone() const { - auto* plugin = new TRTBatchedNMSPluginDynamic(param); +IPluginV2DynamicExt* TRTBatchedNMS::clone() const { + auto* plugin = new TRTBatchedNMS(mLayerName, param); plugin->boxesSize = boxesSize; plugin->scoresSize = scoresSize; plugin->numPriors = numPriors; @@ -153,16 +148,7 @@ IPluginV2DynamicExt* TRTBatchedNMSPluginDynamic::clone() const { return plugin; } -void TRTBatchedNMSPluginDynamic::setPluginNamespace( - const char* pluginNamespace) { - mNamespace = pluginNamespace; -} - -const char* TRTBatchedNMSPluginDynamic::getPluginNamespace() const { - return mNamespace.c_str(); -} - -nvinfer1::DataType TRTBatchedNMSPluginDynamic::getOutputDataType( +nvinfer1::DataType TRTBatchedNMS::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { ASSERT(index >= 0 && index < this->getNbOutputs()); if (index == 1) { @@ -171,10 +157,9 @@ nvinfer1::DataType TRTBatchedNMSPluginDynamic::getOutputDataType( return inputTypes[0]; } -void TRTBatchedNMSPluginDynamic::setClipParam(bool clip) { mClipBoxes = clip; } +void TRTBatchedNMS::setClipParam(bool clip) { mClipBoxes = clip; } -TRTBatchedNMSPluginDynamicCreator::TRTBatchedNMSPluginDynamicCreator() - : params{} { +TRTBatchedNMSCreator::TRTBatchedNMSCreator() { mPluginAttributes.emplace_back( PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1)); mPluginAttributes.emplace_back( @@ -196,23 +181,19 @@ TRTBatchedNMSPluginDynamicCreator::TRTBatchedNMSPluginDynamicCreator() mFC.fields = mPluginAttributes.data(); } -const char* TRTBatchedNMSPluginDynamicCreator::getPluginName() const { +const char* TRTBatchedNMSCreator::getPluginName() const { return NMS_PLUGIN_NAME; } -const char* TRTBatchedNMSPluginDynamicCreator::getPluginVersion() const { +const char* TRTBatchedNMSCreator::getPluginVersion() const { return NMS_PLUGIN_VERSION; } -const PluginFieldCollection* -TRTBatchedNMSPluginDynamicCreator::getFieldNames() { - return &mFC; -} - -IPluginV2Ext* TRTBatchedNMSPluginDynamicCreator::createPlugin( +IPluginV2Ext* TRTBatchedNMSCreator::createPlugin( const char* name, const PluginFieldCollection* fc) { const PluginField* fields = fc->fields; bool clipBoxes = true; + nvinfer1::plugin::NMSParameters params{}; for (int i = 0; i < fc->nbFields; ++i) { const char* attrName = fields[i].name; @@ -241,29 +222,21 @@ IPluginV2Ext* TRTBatchedNMSPluginDynamicCreator::createPlugin( } } - TRTBatchedNMSPluginDynamic* plugin = new TRTBatchedNMSPluginDynamic(params); + TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params); plugin->setClipParam(clipBoxes); plugin->setPluginNamespace(mNamespace.c_str()); return plugin; } -IPluginV2Ext* TRTBatchedNMSPluginDynamicCreator::deserializePlugin( - const char* name, const void* serialData, size_t serialLength) { +IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) { // This object will be deleted when the network is destroyed, which will // call NMS::destroy() - TRTBatchedNMSPluginDynamic* plugin = - new TRTBatchedNMSPluginDynamic(serialData, serialLength); + TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength); plugin->setPluginNamespace(mNamespace.c_str()); return plugin; } -void TRTBatchedNMSPluginDynamicCreator::setPluginNamespace( - const char* libNamespace) { - mNamespace = libNamespace; -} - -const char* TRTBatchedNMSPluginDynamicCreator::getPluginNamespace() const { - return mNamespace.c_str(); -} - -REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSPluginDynamicCreator); +REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSCreator); +} // namespace mmlab diff --git a/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp b/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp index 015d62725f..cb374e7c31 100644 --- a/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp +++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp @@ -5,15 +5,15 @@ #include #include -#include "trt_plugin_helper.hpp" - -class TRTBatchedNMSPluginDynamic : public nvinfer1::IPluginV2DynamicExt { +#include "trt_plugin_base.hpp" +namespace mmlab { +class TRTBatchedNMS : public TRTPluginBase { public: - TRTBatchedNMSPluginDynamic(nvinfer1::plugin::NMSParameters param); + TRTBatchedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param); - TRTBatchedNMSPluginDynamic(const void* data, size_t length); + TRTBatchedNMS(const std::string& name, const void* data, size_t length); - ~TRTBatchedNMSPluginDynamic() override = default; + ~TRTBatchedNMS() override = default; int getNbOutputs() const override; @@ -21,10 +21,6 @@ class TRTBatchedNMSPluginDynamic : public nvinfer1::IPluginV2DynamicExt { int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) override; - int initialize() override; - - void terminate() override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, @@ -52,18 +48,12 @@ class TRTBatchedNMSPluginDynamic : public nvinfer1::IPluginV2DynamicExt { const char* getPluginVersion() const override; - void destroy() override; - nvinfer1::IPluginV2DynamicExt* clone() const override; nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType, int nbInputs) const override; - void setPluginNamespace(const char* libNamespace) override; - - const char* getPluginNamespace() const override; - void setClipParam(bool clip); private: @@ -71,48 +61,25 @@ class TRTBatchedNMSPluginDynamic : public nvinfer1::IPluginV2DynamicExt { int boxesSize{}; int scoresSize{}; int numPriors{}; - std::string mNamespace; bool mClipBoxes{}; - - protected: - // To prevent compiler warnings. - using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::configurePlugin; - using nvinfer1::IPluginV2DynamicExt::enqueue; - using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; - using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize; - using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::supportsFormat; }; -class TRTBatchedNMSPluginDynamicCreator : public nvinfer1::IPluginCreator { +class TRTBatchedNMSCreator : public TRTPluginCreatorBase { public: - TRTBatchedNMSPluginDynamicCreator(); + TRTBatchedNMSCreator(); - ~TRTBatchedNMSPluginDynamicCreator() override = default; + ~TRTBatchedNMSCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; - const nvinfer1::PluginFieldCollection* getFieldNames() override; - nvinfer1::IPluginV2Ext* createPlugin( const char* name, const nvinfer1::PluginFieldCollection* fc) override; nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; - - void setPluginNamespace(const char* libNamespace) override; - - const char* getPluginNamespace() const override; - - private: - nvinfer1::PluginFieldCollection mFC; - nvinfer1::plugin::NMSParameters params; - std::vector mPluginAttributes; - std::string mNamespace; }; - +} // namespace mmlab #endif // TRT_BATCHED_NMS_PLUGIN_CUSTOM_H diff --git a/backend_ops/tensorrt/batched_nms/batchedNMSInference.cpp b/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.cpp similarity index 99% rename from backend_ops/tensorrt/batched_nms/batchedNMSInference.cpp rename to backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.cpp index bbdc42626f..0be033b437 100644 --- a/backend_ops/tensorrt/batched_nms/batchedNMSInference.cpp +++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.cpp @@ -1,7 +1,6 @@ // modify from // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin -#include "cuda_runtime_api.h" -#include "kernel.h" +#include "trt_batched_nms_kernel.hpp" pluginStatus_t nmsInference( cudaStream_t stream, const int N, const int perBatchBoxesSize, diff --git a/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.hpp b/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.hpp new file mode 100644 index 0000000000..133a212743 --- /dev/null +++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.hpp @@ -0,0 +1,16 @@ +#ifndef TRT_BATCHED_NMS_KERNEL_HPP +#define TRT_BATCHED_NMS_KERNEL_HPP +#include "cuda_runtime_api.h" +#include "kernel.h" + +pluginStatus_t nmsInference( + cudaStream_t stream, const int N, const int perBatchBoxesSize, + const int perBatchScoresSize, const bool shareLocation, + const int backgroundLabelId, const int numPredsPerClass, + const int numClasses, const int topK, const int keepTopK, + const float scoreThreshold, const float iouThreshold, + const DataType DT_BBOX, const void* locData, const DataType DT_SCORE, + const void* confData, void* nmsedDets, void* nmsedLabels, void* workspace, + bool isNormalized, bool confSigmoid, bool clipBoxes); + +#endif diff --git a/backend_ops/tensorrt/common/common_cuda_helper.hpp b/backend_ops/tensorrt/common/common_cuda_helper.hpp index a9ab6e82f1..be21d7fccf 100644 --- a/backend_ops/tensorrt/common/common_cuda_helper.hpp +++ b/backend_ops/tensorrt/common/common_cuda_helper.hpp @@ -3,22 +3,49 @@ #include +#include + #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) #define THREADS_PER_BLOCK 512 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) inline int GET_BLOCKS(const int N) { - int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK); int max_block_num = 4096; - return min(optimal_block_num, max_block_num); + return std::min(optimal_block_num, max_block_num); } -template -__device__ T bilinear_interpolate(const T* input, const int height, - const int width, T y, T x, - const int index /* index for debug only*/) { +#define cudaCheckError() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) { \ + printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(0); \ + } \ + } + +/** + * Returns a view of the original tensor with its dimensions permuted. + * + * @param[out] dst pointer to the destination tensor + * @param[in] src pointer to the source tensor + * @param[in] src_size shape of the src tensor + * @param[in] permute The desired ordering of dimensions + * @param[in] src_dim dim of src tensor + * @param[in] stream cuda stream handle + */ +template +void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size, + int *permute, int src_dim, cudaStream_t stream = 0); + +template +__device__ scalar_t bilinear_interpolate(const scalar_t *input, + const int height, const int width, + scalar_t y, scalar_t x) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; @@ -32,79 +59,31 @@ __device__ T bilinear_interpolate(const T* input, const int height, if (y_low >= height - 1) { y_high = y_low = height - 1; - y = (T)y_low; + y = (scalar_t)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; - x = (T)x_low; + x = (scalar_t)x_low; } else { x_high = x_low + 1; } - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; + scalar_t ly = y - y_low; + scalar_t lx = x - x_low; + scalar_t hy = 1. - ly, hx = 1. - lx; // do bilinear interpolation - T v1 = input[y_low * width + x_low]; - T v2 = input[y_low * width + x_high]; - T v3 = input[y_high * width + x_low]; - T v4 = input[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + scalar_t v1 = input[y_low * width + x_low]; + scalar_t v2 = input[y_low * width + x_high]; + scalar_t v3 = input[y_high * width + x_low]; + scalar_t v4 = input[y_high * width + x_high]; + scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } -template -__device__ void bilinear_interpolate_gradient( - const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, - int& x_low, int& x_high, int& y_low, int& y_high, - const int index /* index for debug only*/) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - w1 = w2 = w3 = w4 = 0.; - x_low = x_high = y_low = y_high = -1; - return; - } - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - y_low = (int)y; - x_low = (int)x; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - - // reference in forward - // T v1 = input[y_low * width + x_low]; - // T v2 = input[y_low * width + x_high]; - // T v3 = input[y_high * width + x_low]; - // T v4 = input[y_high * width + x_high]; - // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - return; -} #endif // COMMON_CUDA_HELPER diff --git a/backend_ops/tensorrt/common/trt_cuda_helper.cuh b/backend_ops/tensorrt/common/trt_cuda_helper.cuh deleted file mode 100644 index a4635dcdd5..0000000000 --- a/backend_ops/tensorrt/common/trt_cuda_helper.cuh +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef TRT_CUDA_HELPER_HPP -#define TRT_CUDA_HELPER_HPP - -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) - -#define cudaCheckError() \ - { \ - cudaError_t e = cudaGetLastError(); \ - if (e != cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \ - cudaGetErrorString(e)); \ - exit(0); \ - } \ - } - -/** - * Returns a view of the original tensor with its dimensions permuted. - * - * @param[out] dst pointer to the destination tensor - * @param[in] src pointer to the source tensor - * @param[in] src_size shape of the src tensor - * @param[in] permute The desired ordering of dimensions - * @param[in] src_dim dim of src tensor - * @param[in] stream cuda stream handle - */ -template -void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size, - int *permute, int src_dim, cudaStream_t stream = 0); - -#endif // TRT_CUDA_HELPER_HPP diff --git a/backend_ops/tensorrt/common/trt_plugin_base.hpp b/backend_ops/tensorrt/common/trt_plugin_base.hpp new file mode 100644 index 0000000000..986ca2b41e --- /dev/null +++ b/backend_ops/tensorrt/common/trt_plugin_base.hpp @@ -0,0 +1,55 @@ +#ifndef TRT_PLUGIN_BASE_HPP +#define TRT_PLUGIN_BASE_HPP +#include "NvInferPlugin.h" +#include "trt_plugin_helper.hpp" + +namespace mmlab { +class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt { + public: + TRTPluginBase(const std::string &name) : mLayerName(name) {} + // IPluginV2 Methods + const char *getPluginVersion() const override { return "1"; } + int initialize() override { return STATUS_SUCCESS; } + void terminate() override {} + void destroy() override { delete this; } + void setPluginNamespace(const char *pluginNamespace) override { + mNamespace = pluginNamespace; + } + const char *getPluginNamespace() const override { return mNamespace.c_str(); } + + protected: + const std::string mLayerName; + std::string mNamespace; + + protected: + // To prevent compiler warnings. + using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; + using nvinfer1::IPluginV2DynamicExt::configurePlugin; + using nvinfer1::IPluginV2DynamicExt::enqueue; + using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; + using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize; + using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; + using nvinfer1::IPluginV2DynamicExt::supportsFormat; +}; + +class TRTPluginCreatorBase : public nvinfer1::IPluginCreator { + public: + const char *getPluginVersion() const override { return "1"; }; + + const nvinfer1::PluginFieldCollection *getFieldNames() override { + return &mFC; + } + + void setPluginNamespace(const char *pluginNamespace) override { + mNamespace = pluginNamespace; + } + + const char *getPluginNamespace() const override { return mNamespace.c_str(); } + + protected: + nvinfer1::PluginFieldCollection mFC; + std::vector mPluginAttributes; + std::string mNamespace; +}; +} // namespace mmlab +#endif diff --git a/backend_ops/tensorrt/common/trt_plugin_helper.hpp b/backend_ops/tensorrt/common/trt_plugin_helper.hpp index e8b019293a..2c8e3919bc 100644 --- a/backend_ops/tensorrt/common/trt_plugin_helper.hpp +++ b/backend_ops/tensorrt/common/trt_plugin_helper.hpp @@ -1,5 +1,6 @@ #ifndef TRT_PLUGIN_HELPER_HPP #define TRT_PLUGIN_HELPER_HPP +#include #include #include "NvInferPlugin.h" diff --git a/backend_ops/tensorrt/common/trt_serialize.hpp b/backend_ops/tensorrt/common/trt_serialize.hpp index 1f0899fdfe..dfbf735a7d 100644 --- a/backend_ops/tensorrt/common/trt_serialize.hpp +++ b/backend_ops/tensorrt/common/trt_serialize.hpp @@ -5,12 +5,8 @@ #define TRT_SERIALIZE_HPP #include #include -#include #include #include -using std::cerr; -using std::cout; -using std::endl; template inline void serialize_value(void** buffer, T const& value); diff --git a/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu b/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu index 3aa7014ffc..b4f673e501 100644 --- a/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu +++ b/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu @@ -1,5 +1,4 @@ #include "common_cuda_helper.hpp" -#include "trt_cuda_helper.cuh" #include "trt_plugin_helper.hpp" using mmlab::TensorDesc; diff --git a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp index 8aae0ed3f1..cbf3eb7122 100644 --- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp +++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp @@ -6,18 +6,19 @@ #include #include "trt_multi_level_roi_align_kernel.hpp" +#include "trt_plugin_helper.hpp" #include "trt_serialize.hpp" - +namespace mmlab { namespace { static const char *PLUGIN_VERSION{"1"}; static const char *PLUGIN_NAME{"MMCVMultiLevelRoiAlign"}; } // namespace -MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic( +TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign( const std::string &name, int alignedHeight, int alignedWidth, int sampleNum, const std::vector &featmapStrides, float roiScaleFactor, int finestScale, bool aligned) - : mLayerName(name), + : TRTPluginBase(name), mAlignedHeight(alignedHeight), mAlignedWidth(alignedWidth), mSampleNum(sampleNum), @@ -26,9 +27,9 @@ MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic( mFinestScale(finestScale), mAligned(aligned) {} -MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic( - const std::string name, const void *data, size_t length) - : mLayerName(name) { +TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name, + const void *data, size_t length) + : TRTPluginBase(name) { deserialize_value(&data, &length, &mAlignedHeight); deserialize_value(&data, &length, &mAlignedWidth); deserialize_value(&data, &length, &mSampleNum); @@ -38,8 +39,8 @@ MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic( deserialize_value(&data, &length, &mFeatmapStrides); } -nvinfer1::IPluginV2DynamicExt *MultiLevelRoiAlignPluginDynamic::clone() const { - MultiLevelRoiAlignPluginDynamic *plugin = new MultiLevelRoiAlignPluginDynamic( +nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRoiAlign::clone() const { + TRTMultiLevelRoiAlign *plugin = new TRTMultiLevelRoiAlign( mLayerName, mAlignedHeight, mAlignedWidth, mSampleNum, mFeatmapStrides, mRoiScaleFactor, mFinestScale, mAligned); plugin->setPluginNamespace(getPluginNamespace()); @@ -47,10 +48,10 @@ nvinfer1::IPluginV2DynamicExt *MultiLevelRoiAlignPluginDynamic::clone() const { return plugin; } -nvinfer1::DimsExprs MultiLevelRoiAlignPluginDynamic::getOutputDimensions( +nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, nvinfer1::IExprBuilder &exprBuilder) { - assert(nbInputs == mFeatmapStrides.size() + 1); + ASSERT(nbInputs == mFeatmapStrides.size() + 1); nvinfer1::DimsExprs ret; ret.nbDims = 4; @@ -62,33 +63,32 @@ nvinfer1::DimsExprs MultiLevelRoiAlignPluginDynamic::getOutputDimensions( return ret; } -bool MultiLevelRoiAlignPluginDynamic::supportsFormatCombination( +bool TRTMultiLevelRoiAlign::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs, int nbOutputs) { - const auto *in = inOut; - const auto *out = inOut + nbInputs; return inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; } -void MultiLevelRoiAlignPluginDynamic::configurePlugin( +void TRTMultiLevelRoiAlign::configurePlugin( const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) { // Validate input arguments - assert(nbOutputs == 1); - assert(nbInputs == mFeatmapStrides.size() + 1); + ASSERT(nbOutputs == 1); + ASSERT(nbInputs == mFeatmapStrides.size() + 1); } -size_t MultiLevelRoiAlignPluginDynamic::getWorkspaceSize( +size_t TRTMultiLevelRoiAlign::getWorkspaceSize( const nvinfer1::PluginTensorDesc *inputs, int nbInputs, const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const { return 0; } -int MultiLevelRoiAlignPluginDynamic::enqueue( - const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, cudaStream_t stream) { +int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, + void *const *outputs, void *workSpace, + cudaStream_t stream) { int num_rois = inputDesc[0].dims.d[0]; int batch_size = inputDesc[1].dims.d[0]; int channels = inputDesc[1].dims.d[1]; @@ -117,34 +117,28 @@ int MultiLevelRoiAlignPluginDynamic::enqueue( return 0; } -nvinfer1::DataType MultiLevelRoiAlignPluginDynamic::getOutputDataType( +nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType( int index, const nvinfer1::DataType *inputTypes, int nbInputs) const { return nvinfer1::DataType::kFLOAT; } // IPluginV2 Methods -const char *MultiLevelRoiAlignPluginDynamic::getPluginType() const { - return PLUGIN_NAME; -} +const char *TRTMultiLevelRoiAlign::getPluginType() const { return PLUGIN_NAME; } -const char *MultiLevelRoiAlignPluginDynamic::getPluginVersion() const { +const char *TRTMultiLevelRoiAlign::getPluginVersion() const { return PLUGIN_VERSION; } -int MultiLevelRoiAlignPluginDynamic::getNbOutputs() const { return 1; } - -int MultiLevelRoiAlignPluginDynamic::initialize() { return 0; } - -void MultiLevelRoiAlignPluginDynamic::terminate() {} +int TRTMultiLevelRoiAlign::getNbOutputs() const { return 1; } -size_t MultiLevelRoiAlignPluginDynamic::getSerializationSize() const { +size_t TRTMultiLevelRoiAlign::getSerializationSize() const { return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + serialized_size(mAlignedWidth) + serialized_size(mSampleNum) + serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) + serialized_size(mAligned); } -void MultiLevelRoiAlignPluginDynamic::serialize(void *buffer) const { +void TRTMultiLevelRoiAlign::serialize(void *buffer) const { serialize_value(&buffer, mAlignedHeight); serialize_value(&buffer, mAlignedWidth); serialize_value(&buffer, mSampleNum); @@ -154,22 +148,7 @@ void MultiLevelRoiAlignPluginDynamic::serialize(void *buffer) const { serialize_value(&buffer, mFeatmapStrides); } -void MultiLevelRoiAlignPluginDynamic::destroy() { - // This gets called when the network containing plugin is destroyed - delete this; -} - -void MultiLevelRoiAlignPluginDynamic::setPluginNamespace( - const char *libNamespace) { - mNamespace = libNamespace; -} - -const char *MultiLevelRoiAlignPluginDynamic::getPluginNamespace() const { - return mNamespace.c_str(); -} - -MultiLevelRoiAlignPluginDynamicCreator:: - MultiLevelRoiAlignPluginDynamicCreator() { +TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() { mPluginAttributes = std::vector( {nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"), @@ -182,20 +161,15 @@ MultiLevelRoiAlignPluginDynamicCreator:: mFC.fields = mPluginAttributes.data(); } -const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginName() const { +const char *TRTMultiLevelRoiAlignCreator::getPluginName() const { return PLUGIN_NAME; } -const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginVersion() const { +const char *TRTMultiLevelRoiAlignCreator::getPluginVersion() const { return PLUGIN_VERSION; } -const nvinfer1::PluginFieldCollection * -MultiLevelRoiAlignPluginDynamicCreator::getFieldNames() { - return &mFC; -} - -nvinfer1::IPluginV2 *MultiLevelRoiAlignPluginDynamicCreator::createPlugin( +nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::createPlugin( const char *name, const nvinfer1::PluginFieldCollection *fc) { int alignedHeight = 7; int alignedWidth = 7; @@ -231,30 +205,21 @@ nvinfer1::IPluginV2 *MultiLevelRoiAlignPluginDynamicCreator::createPlugin( } } - assert(featmapStrides.size() != 0); + ASSERT(featmapStrides.size() != 0); - MultiLevelRoiAlignPluginDynamic *plugin = new MultiLevelRoiAlignPluginDynamic( + TRTMultiLevelRoiAlign *plugin = new TRTMultiLevelRoiAlign( name, alignedHeight, alignedWidth, sampleNum, featmapStrides, roiScaleFactor, finestScale, aligned); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -nvinfer1::IPluginV2 *MultiLevelRoiAlignPluginDynamicCreator::deserializePlugin( +nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::deserializePlugin( const char *name, const void *serialData, size_t serialLength) { - auto plugin = - new MultiLevelRoiAlignPluginDynamic(name, serialData, serialLength); + auto plugin = new TRTMultiLevelRoiAlign(name, serialData, serialLength); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -void MultiLevelRoiAlignPluginDynamicCreator::setPluginNamespace( - const char *libNamespace) { - mNamespace = libNamespace; -} - -const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginNamespace() const { - return mNamespace.c_str(); -} - -REGISTER_TENSORRT_PLUGIN(MultiLevelRoiAlignPluginDynamicCreator); +REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRoiAlignCreator); +} // namespace mmlab diff --git a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp index e8ba617bf6..d55d8cd0fe 100644 --- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp +++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp @@ -7,22 +7,21 @@ #include #include -#include "trt_plugin_helper.hpp" +#include "trt_plugin_base.hpp" -class MultiLevelRoiAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt { +namespace mmlab { +class TRTMultiLevelRoiAlign : public TRTPluginBase { public: - MultiLevelRoiAlignPluginDynamic(const std::string &name, int alignedHeight, - int alignedWidth, int sampleNum, - const std::vector &featmapStrides, - float roiScaleFactor = -1, - int finestScale = 56, bool aligned = false); + TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight, + int alignedWidth, int sampleNum, + const std::vector &featmapStrides, + float roiScaleFactor = -1, int finestScale = 56, + bool aligned = false); - MultiLevelRoiAlignPluginDynamic(const std::string name, const void *data, - size_t length); + TRTMultiLevelRoiAlign(const std::string name, const void *data, + size_t length); - // It doesn't make sense to make MultiLevelRoiAlignPluginDynamic without - // arguments, so we delete default constructor. - MultiLevelRoiAlignPluginDynamic() = delete; + TRTMultiLevelRoiAlign() = delete; // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt *clone() const override; @@ -54,18 +53,10 @@ class MultiLevelRoiAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt { const char *getPluginType() const override; const char *getPluginVersion() const override; int getNbOutputs() const override; - int initialize() override; - void terminate() override; size_t getSerializationSize() const override; void serialize(void *buffer) const override; - void destroy() override; - void setPluginNamespace(const char *pluginNamespace) override; - const char *getPluginNamespace() const override; private: - const std::string mLayerName; - std::string mNamespace; - int mAlignedHeight; int mAlignedWidth; int mSampleNum; @@ -73,43 +64,22 @@ class MultiLevelRoiAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt { float mRoiScaleFactor; int mFinestScale; bool mAligned; - - protected: - // To prevent compiler warnings. - using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::configurePlugin; - using nvinfer1::IPluginV2DynamicExt::enqueue; - using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; - using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize; - using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::supportsFormat; }; -class MultiLevelRoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator { +class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase { public: - MultiLevelRoiAlignPluginDynamicCreator(); + TRTMultiLevelRoiAlignCreator(); const char *getPluginName() const override; const char *getPluginVersion() const override; - const nvinfer1::PluginFieldCollection *getFieldNames() override; - nvinfer1::IPluginV2 *createPlugin( const char *name, const nvinfer1::PluginFieldCollection *fc) override; nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) override; - - void setPluginNamespace(const char *pluginNamespace) override; - - const char *getPluginNamespace() const override; - - private: - nvinfer1::PluginFieldCollection mFC; - std::vector mPluginAttributes; - std::string mNamespace; }; - +} // namespace mmlab #endif // TRT_ROI_ALIGN_HPP diff --git a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu index a58fcfdfda..f85d349e1d 100644 --- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu +++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu @@ -4,7 +4,6 @@ #include #include "common_cuda_helper.hpp" -#include "trt_cuda_helper.cuh" #include "trt_multi_level_roi_align_kernel.hpp" #include "trt_plugin_helper.hpp" @@ -19,53 +18,6 @@ struct FeatData { int num_featmap; }; -template -__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data, - const int height, const int width, - scalar_t y, scalar_t x) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return 0; - } - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (scalar_t)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (scalar_t)x_low; - } else { - x_high = x_low + 1; - } - - scalar_t ly = y - y_low; - scalar_t lx = x - x_low; - scalar_t hy = 1. - ly; - scalar_t hx = 1. - lx; - // do bilinear interpolation - scalar_t lt = bottom_data[y_low * width + x_low]; - scalar_t rt = bottom_data[y_low * width + x_high]; - scalar_t lb = bottom_data[y_high * width + x_low]; - scalar_t rb = bottom_data[y_high * width + x_high]; - scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb); - - return val; -} - template __device__ scalar_t roi_align_single( const scalar_t *bottom_data, const int roi_batch_ind, diff --git a/backend_ops/tensorrt/nms/trt_nms.cpp b/backend_ops/tensorrt/nms/trt_nms.cpp index d82be1bd3c..3335b448d7 100644 --- a/backend_ops/tensorrt/nms/trt_nms.cpp +++ b/backend_ops/tensorrt/nms/trt_nms.cpp @@ -5,39 +5,26 @@ #include +#include "trt_nms_kernel.hpp" #include "trt_serialize.hpp" - -extern size_t get_onnxnms_workspace_size( - size_t num_batches, size_t spatial_dimension, size_t num_classes, - size_t boxes_word_size, int center_point_box, size_t output_length); - -extern void TRTNMSCUDAKernelLauncher_float( - const float *boxes, const float *scores, - const int max_output_boxes_per_class, const float iou_threshold, - const float score_threshold, const int offset, int *output, - int center_point_box, int num_batches, int spatial_dimension, - int num_classes, size_t output_length, void *workspace, - cudaStream_t stream); - +namespace mmlab { namespace { static const char *PLUGIN_VERSION{"1"}; static const char *PLUGIN_NAME{"NonMaxSuppression"}; } // namespace -NonMaxSuppressionDynamic::NonMaxSuppressionDynamic( - const std::string &name, int centerPointBox, int maxOutputBoxesPerClass, - float iouThreshold, float scoreThreshold, int offset) - : mLayerName(name), +TRTNMS::TRTNMS(const std::string &name, int centerPointBox, + int maxOutputBoxesPerClass, float iouThreshold, + float scoreThreshold, int offset) + : TRTPluginBase(name), mCenterPointBox(centerPointBox), mMaxOutputBoxesPerClass(maxOutputBoxesPerClass), mIouThreshold(iouThreshold), mScoreThreshold(scoreThreshold), mOffset(offset) {} -NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(const std::string name, - const void *data, - size_t length) - : mLayerName(name) { +TRTNMS::TRTNMS(const std::string name, const void *data, size_t length) + : TRTPluginBase(name) { deserialize_value(&data, &length, &mCenterPointBox); deserialize_value(&data, &length, &mMaxOutputBoxesPerClass); deserialize_value(&data, &length, &mIouThreshold); @@ -45,16 +32,16 @@ NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(const std::string name, deserialize_value(&data, &length, &mOffset); } -nvinfer1::IPluginV2DynamicExt *NonMaxSuppressionDynamic::clone() const { - NonMaxSuppressionDynamic *plugin = new NonMaxSuppressionDynamic( - mLayerName, mCenterPointBox, mMaxOutputBoxesPerClass, mIouThreshold, - mScoreThreshold, mOffset); +nvinfer1::IPluginV2DynamicExt *TRTNMS::clone() const { + TRTNMS *plugin = + new TRTNMS(mLayerName, mCenterPointBox, mMaxOutputBoxesPerClass, + mIouThreshold, mScoreThreshold, mOffset); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -nvinfer1::DimsExprs NonMaxSuppressionDynamic::getOutputDimensions( +nvinfer1::DimsExprs TRTNMS::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, nvinfer1::IExprBuilder &exprBuilder) { nvinfer1::DimsExprs ret; @@ -76,9 +63,9 @@ nvinfer1::DimsExprs NonMaxSuppressionDynamic::getOutputDimensions( return ret; } -bool NonMaxSuppressionDynamic::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs, - int nbOutputs) { +bool TRTNMS::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc *inOut, + int nbInputs, int nbOutputs) { if (pos < nbInputs) { switch (pos) { case 0: @@ -105,13 +92,15 @@ bool NonMaxSuppressionDynamic::supportsFormatCombination( return true; } -void NonMaxSuppressionDynamic::configurePlugin( - const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {} +void TRTNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *outputs, + int nbOutputs) {} -size_t NonMaxSuppressionDynamic::getWorkspaceSize( - const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const { +size_t TRTNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, + int nbOutputs) const { size_t boxes_word_size = mmlab::getElementSize(inputs[0].type); size_t num_batches = inputs[0].dims.d[0]; size_t spatial_dimension = inputs[0].dims.d[1]; @@ -123,10 +112,10 @@ size_t NonMaxSuppressionDynamic::getWorkspaceSize( output_length); } -int NonMaxSuppressionDynamic::enqueue( - const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, cudaStream_t stream) { +int TRTNMS::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, + void *workSpace, cudaStream_t stream) { int num_batches = inputDesc[0].dims.d[0]; int spatial_dimension = inputDesc[0].dims.d[1]; int num_classes = inputDesc[1].dims.d[1]; @@ -135,42 +124,34 @@ int NonMaxSuppressionDynamic::enqueue( const float *boxes = (const float *)inputs[0]; const float *scores = (const float *)inputs[1]; int *output = (int *)outputs[0]; - TRTNMSCUDAKernelLauncher_float( - boxes, scores, mMaxOutputBoxesPerClass, mIouThreshold, mScoreThreshold, - mOffset, output, mCenterPointBox, num_batches, spatial_dimension, - num_classes, output_length, workSpace, stream); + NMSCUDAKernelLauncher_float(boxes, scores, mMaxOutputBoxesPerClass, + mIouThreshold, mScoreThreshold, mOffset, output, + mCenterPointBox, num_batches, spatial_dimension, + num_classes, output_length, workSpace, stream); return 0; } -nvinfer1::DataType NonMaxSuppressionDynamic::getOutputDataType( +nvinfer1::DataType TRTNMS::getOutputDataType( int index, const nvinfer1::DataType *inputTypes, int nbInputs) const { return nvinfer1::DataType::kINT32; } // IPluginV2 Methods -const char *NonMaxSuppressionDynamic::getPluginType() const { - return PLUGIN_NAME; -} - -const char *NonMaxSuppressionDynamic::getPluginVersion() const { - return PLUGIN_VERSION; -} +const char *TRTNMS::getPluginType() const { return PLUGIN_NAME; } -int NonMaxSuppressionDynamic::getNbOutputs() const { return 1; } +const char *TRTNMS::getPluginVersion() const { return PLUGIN_VERSION; } -int NonMaxSuppressionDynamic::initialize() { return 0; } +int TRTNMS::getNbOutputs() const { return 1; } -void NonMaxSuppressionDynamic::terminate() {} - -size_t NonMaxSuppressionDynamic::getSerializationSize() const { +size_t TRTNMS::getSerializationSize() const { return serialized_size(mCenterPointBox) + serialized_size(mMaxOutputBoxesPerClass) + serialized_size(mIouThreshold) + serialized_size(mScoreThreshold) + serialized_size(mOffset); } -void NonMaxSuppressionDynamic::serialize(void *buffer) const { +void TRTNMS::serialize(void *buffer) const { serialize_value(&buffer, mCenterPointBox); serialize_value(&buffer, mMaxOutputBoxesPerClass); serialize_value(&buffer, mIouThreshold); @@ -178,22 +159,7 @@ void NonMaxSuppressionDynamic::serialize(void *buffer) const { serialize_value(&buffer, mOffset); } -void NonMaxSuppressionDynamic::destroy() { - // This gets called when the network containing plugin is destroyed - delete this; -} - -void NonMaxSuppressionDynamic::setPluginNamespace(const char *libNamespace) { - mNamespace = libNamespace; -} - -const char *NonMaxSuppressionDynamic::getPluginNamespace() const { - return mNamespace.c_str(); -} - -////////////////////// creator ///////////////////////////// - -NonMaxSuppressionDynamicCreator::NonMaxSuppressionDynamicCreator() { +TRTNMSCreator::TRTNMSCreator() { mPluginAttributes.clear(); mPluginAttributes.emplace_back(nvinfer1::PluginField("center_point_box")); mPluginAttributes.emplace_back( @@ -205,20 +171,11 @@ NonMaxSuppressionDynamicCreator::NonMaxSuppressionDynamicCreator() { mFC.fields = mPluginAttributes.data(); } -const char *NonMaxSuppressionDynamicCreator::getPluginName() const { - return PLUGIN_NAME; -} - -const char *NonMaxSuppressionDynamicCreator::getPluginVersion() const { - return PLUGIN_VERSION; -} +const char *TRTNMSCreator::getPluginName() const { return PLUGIN_NAME; } -const nvinfer1::PluginFieldCollection * -NonMaxSuppressionDynamicCreator::getFieldNames() { - return &mFC; -} +const char *TRTNMSCreator::getPluginVersion() const { return PLUGIN_VERSION; } -nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::createPlugin( +nvinfer1::IPluginV2 *TRTNMSCreator::createPlugin( const char *name, const nvinfer1::PluginFieldCollection *fc) { int centerPointBox = 0; int maxOutputBoxesPerClass = 0; @@ -252,27 +209,18 @@ nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::createPlugin( offset = static_cast(fc->fields[i].data)[0]; } } - NonMaxSuppressionDynamic *plugin = - new NonMaxSuppressionDynamic(name, centerPointBox, maxOutputBoxesPerClass, - iouThreshold, scoreThreshold, offset); + TRTNMS *plugin = new TRTNMS(name, centerPointBox, maxOutputBoxesPerClass, + iouThreshold, scoreThreshold, offset); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) { - auto plugin = new NonMaxSuppressionDynamic(name, serialData, serialLength); +nvinfer1::IPluginV2 *TRTNMSCreator::deserializePlugin(const char *name, + const void *serialData, + size_t serialLength) { + auto plugin = new TRTNMS(name, serialData, serialLength); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } - -void NonMaxSuppressionDynamicCreator::setPluginNamespace( - const char *libNamespace) { - mNamespace = libNamespace; -} - -const char *NonMaxSuppressionDynamicCreator::getPluginNamespace() const { - return mNamespace.c_str(); -} - -REGISTER_TENSORRT_PLUGIN(NonMaxSuppressionDynamicCreator); +REGISTER_TENSORRT_PLUGIN(TRTNMSCreator); +} // namespace mmlab diff --git a/backend_ops/tensorrt/nms/trt_nms.hpp b/backend_ops/tensorrt/nms/trt_nms.hpp index a696ea73c5..1970113365 100644 --- a/backend_ops/tensorrt/nms/trt_nms.hpp +++ b/backend_ops/tensorrt/nms/trt_nms.hpp @@ -6,18 +6,17 @@ #include #include -#include "trt_plugin_helper.hpp" - -class NonMaxSuppressionDynamic : public nvinfer1::IPluginV2DynamicExt { +#include "trt_plugin_base.hpp" +namespace mmlab { +class TRTNMS : public TRTPluginBase { public: - NonMaxSuppressionDynamic(const std::string &name, int centerPointBox, - int maxOutputBoxesPerClass, float iouThreshold, - float scoreThreshold, int offset); + TRTNMS(const std::string &name, int centerPointBox, + int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, + int offset); - NonMaxSuppressionDynamic(const std::string name, const void *data, - size_t length); + TRTNMS(const std::string name, const void *data, size_t length); - NonMaxSuppressionDynamic() = delete; + TRTNMS() = delete; // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt *clone() const override; @@ -49,59 +48,31 @@ class NonMaxSuppressionDynamic : public nvinfer1::IPluginV2DynamicExt { const char *getPluginType() const override; const char *getPluginVersion() const override; int getNbOutputs() const override; - int initialize() override; - void terminate() override; size_t getSerializationSize() const override; void serialize(void *buffer) const override; - void destroy() override; - void setPluginNamespace(const char *pluginNamespace) override; - const char *getPluginNamespace() const override; private: - const std::string mLayerName; - std::string mNamespace; - int mCenterPointBox; int mMaxOutputBoxesPerClass; float mIouThreshold; float mScoreThreshold; int mOffset; - - protected: - // To prevent compiler warnings. - using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::configurePlugin; - using nvinfer1::IPluginV2DynamicExt::enqueue; - using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; - using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize; - using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::supportsFormat; }; -class NonMaxSuppressionDynamicCreator : public nvinfer1::IPluginCreator { +class TRTNMSCreator : public TRTPluginCreatorBase { public: - NonMaxSuppressionDynamicCreator(); + TRTNMSCreator(); const char *getPluginName() const override; const char *getPluginVersion() const override; - const nvinfer1::PluginFieldCollection *getFieldNames() override; - nvinfer1::IPluginV2 *createPlugin( const char *name, const nvinfer1::PluginFieldCollection *fc) override; nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) override; - - void setPluginNamespace(const char *pluginNamespace) override; - - const char *getPluginNamespace() const override; - - private: - nvinfer1::PluginFieldCollection mFC; - std::vector mPluginAttributes; - std::string mNamespace; }; +} // namespace mmlab #endif // TRT_NMS_HPP diff --git a/backend_ops/tensorrt/nms/trt_nms_kernel.cu b/backend_ops/tensorrt/nms/trt_nms_kernel.cu index bd12f3ac55..324db93a70 100644 --- a/backend_ops/tensorrt/nms/trt_nms_kernel.cu +++ b/backend_ops/tensorrt/nms/trt_nms_kernel.cu @@ -1,3 +1,4 @@ +#include #include #include #include @@ -9,8 +10,7 @@ #include #include "common_cuda_helper.hpp" -#include "trt_cuda_helper.cuh" -#include "trt_nms_kernel.cuh" +#include "trt_nms_kernel.hpp" #include "trt_plugin_helper.hpp" struct NMSBox { @@ -50,6 +50,65 @@ struct nms_score_threshold { } }; +static int const threadsPerBlock = sizeof(unsigned long long int) * 8; + +__device__ inline bool devIoU(float const* const a, float const* const b, + const int offset, const float threshold) { + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + float width = fmaxf(right - left + offset, 0.f), + height = fmaxf(bottom - top + offset, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); + float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); + return interS > threshold * (Sa + Sb - interS); +} + +__global__ void nms_cuda(const int n_boxes, const float iou_threshold, + const int offset, const float* dev_boxes, + unsigned long long* dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + const int tid = threadIdx.x; + + if (row_start > col_start) return; + + const int row_size = + fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 4]; + if (tid < col_size) { + block_boxes[tid * 4 + 0] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; + block_boxes[tid * 4 + 1] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; + block_boxes[tid * 4 + 2] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; + block_boxes[tid * 4 + 3] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; + } + __syncthreads(); + + if (tid < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + tid; + const float* cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + unsigned long long int t = 0; + int start = 0; + if (row_start == col_start) { + start = tid + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { + t |= 1ULL << i; + } + } + dev_mask[cur_box_idx * gridDim.y + col_start] = t; + } +} + __global__ void nms_reindex_kernel(int n, int* output, int* index_cache) { CUDA_1D_KERNEL_LOOP(index, n) { const int old_index = output[index * 3 + 2]; @@ -152,15 +211,14 @@ size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension, * @param[in] workspace memory for all temporary variables. * @param[in] stream cuda stream */ -void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores, - const int max_output_boxes_per_class, - const float iou_threshold, - const float score_threshold, - const int offset, int* output, - int center_point_box, int num_batches, - int spatial_dimension, int num_classes, - size_t output_length, void* workspace, - cudaStream_t stream) { +void NMSCUDAKernelLauncher_float(const float* boxes, const float* scores, + const int max_output_boxes_per_class, + const float iou_threshold, + const float score_threshold, const int offset, + int* output, int center_point_box, + int num_batches, int spatial_dimension, + int num_classes, size_t output_length, + void* workspace, cudaStream_t stream) { using mmlab::getAlignedSize; const int col_blocks = DIVUP(spatial_dimension, THREADS_PER_BLOCK); float* boxes_sorted = (float*)workspace; diff --git a/backend_ops/tensorrt/nms/trt_nms_kernel.cuh b/backend_ops/tensorrt/nms/trt_nms_kernel.cuh deleted file mode 100644 index 9238aefefe..0000000000 --- a/backend_ops/tensorrt/nms/trt_nms_kernel.cuh +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef NMS_CUDA_KERNEL_CUH -#define NMS_CUDA_KERNEL_CUH - -#include - -#include "common_cuda_helper.hpp" - -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) -int const threadsPerBlock = sizeof(unsigned long long int) * 8; - -__device__ inline bool devIoU(float const *const a, float const *const b, - const int offset, const float threshold) { - float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); - float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); - float width = fmaxf(right - left + offset, 0.f), - height = fmaxf(bottom - top + offset, 0.f); - float interS = width * height; - float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); - float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); - return interS > threshold * (Sa + Sb - interS); -} - -__global__ void nms_cuda(const int n_boxes, const float iou_threshold, - const int offset, const float *dev_boxes, - unsigned long long *dev_mask) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - const int tid = threadIdx.x; - - if (row_start > col_start) return; - - const int row_size = - fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); - - __shared__ float block_boxes[threadsPerBlock * 4]; - if (tid < col_size) { - block_boxes[tid * 4 + 0] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; - block_boxes[tid * 4 + 1] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; - block_boxes[tid * 4 + 2] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; - block_boxes[tid * 4 + 3] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; - } - __syncthreads(); - - if (tid < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + tid; - const float *cur_box = dev_boxes + cur_box_idx * 4; - int i = 0; - unsigned long long int t = 0; - int start = 0; - if (row_start == col_start) { - start = tid + 1; - } - for (i = start; i < col_size; i++) { - if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { - t |= 1ULL << i; - } - } - dev_mask[cur_box_idx * gridDim.y + col_start] = t; - } -} -#endif // NMS_CUDA_KERNEL_CUH diff --git a/backend_ops/tensorrt/nms/trt_nms_kernel.hpp b/backend_ops/tensorrt/nms/trt_nms_kernel.hpp new file mode 100644 index 0000000000..91a522cb53 --- /dev/null +++ b/backend_ops/tensorrt/nms/trt_nms_kernel.hpp @@ -0,0 +1,18 @@ +#ifndef NMS_CUDA_KERNEL_HPP +#define NMS_CUDA_KERNEL_HPP + +#include "common_cuda_helper.hpp" + +size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension, + size_t num_classes, size_t boxes_word_size, + int center_point_box, size_t output_length); + +void NMSCUDAKernelLauncher_float(const float *boxes, const float *scores, + const int max_output_boxes_per_class, + const float iou_threshold, + const float score_threshold, const int offset, + int *output, int center_point_box, + int num_batches, int spatial_dimension, + int num_classes, size_t output_length, + void *workspace, cudaStream_t stream); +#endif // NMS_CUDA_KERNEL_HPP diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align.cpp b/backend_ops/tensorrt/roi_align/trt_roi_align.cpp index 0c14c96745..f646306beb 100644 --- a/backend_ops/tensorrt/roi_align/trt_roi_align.cpp +++ b/backend_ops/tensorrt/roi_align/trt_roi_align.cpp @@ -1,28 +1,23 @@ #include "trt_roi_align.hpp" -#include - #include +#include +#include "common_cuda_helper.hpp" +#include "trt_plugin_helper.hpp" +#include "trt_roi_align_kernel.hpp" #include "trt_serialize.hpp" -extern void TRTRoIAlignForwardCUDAKernelLauncher_float( - const float *input, const float *rois, float *output, float *argmax_y, - float *argmax_x, int output_size, int channels, int height, int width, - int aligned_height, int aligned_width, float spatial_scale, - int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream); - +namespace mmlab { namespace { static const char *PLUGIN_VERSION{"1"}; static const char *PLUGIN_NAME{"MMCVRoiAlign"}; } // namespace -RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string &name, - int outWidth, int outHeight, - float spatialScale, - int sampleRatio, int poolMode, - bool aligned) - : mLayerName(name), +TRTRoIAlign::TRTRoIAlign(const std::string &name, int outWidth, int outHeight, + float spatialScale, int sampleRatio, int poolMode, + bool aligned) + : TRTPluginBase(name), mOutWidth(outWidth), mOutHeight(outHeight), mSpatialScale(spatialScale), @@ -30,9 +25,9 @@ RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string &name, mPoolMode(poolMode), mAligned(aligned) {} -RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string name, - const void *data, size_t length) - : mLayerName(name) { +TRTRoIAlign::TRTRoIAlign(const std::string name, const void *data, + size_t length) + : TRTPluginBase(name) { deserialize_value(&data, &length, &mOutWidth); deserialize_value(&data, &length, &mOutHeight); deserialize_value(&data, &length, &mSpatialScale); @@ -41,16 +36,16 @@ RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string name, deserialize_value(&data, &length, &mAligned); } -nvinfer1::IPluginV2DynamicExt *RoIAlignPluginDynamic::clone() const { - RoIAlignPluginDynamic *plugin = new RoIAlignPluginDynamic( - mLayerName, mOutWidth, mOutHeight, mSpatialScale, mSampleRatio, mPoolMode, - mAligned); +nvinfer1::IPluginV2DynamicExt *TRTRoIAlign::clone() const { + TRTRoIAlign *plugin = + new TRTRoIAlign(mLayerName, mOutWidth, mOutHeight, mSpatialScale, + mSampleRatio, mPoolMode, mAligned); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -nvinfer1::DimsExprs RoIAlignPluginDynamic::getOutputDimensions( +nvinfer1::DimsExprs TRTRoIAlign::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, nvinfer1::IExprBuilder &exprBuilder) { nvinfer1::DimsExprs ret; @@ -63,20 +58,21 @@ nvinfer1::DimsExprs RoIAlignPluginDynamic::getOutputDimensions( return ret; } -bool RoIAlignPluginDynamic::supportsFormatCombination( +bool TRTRoIAlign::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs, int nbOutputs) { return inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; } -void RoIAlignPluginDynamic::configurePlugin( +void TRTRoIAlign::configurePlugin( const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {} -size_t RoIAlignPluginDynamic::getWorkspaceSize( - const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const { +size_t TRTRoIAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, + int nbOutputs) const { size_t output_size = 0; size_t word_size = 0; switch (mPoolMode) { @@ -95,11 +91,10 @@ size_t RoIAlignPluginDynamic::getWorkspaceSize( return 0; } -int RoIAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, - void *const *outputs, void *workSpace, - cudaStream_t stream) { +int TRTRoIAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, + void *workSpace, cudaStream_t stream) { int channels = inputDesc[0].dims.d[1]; int height = inputDesc[0].dims.d[2]; int width = inputDesc[0].dims.d[3]; @@ -125,7 +120,7 @@ int RoIAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, switch (outputDesc[0].type) { case nvinfer1::DataType::kFLOAT: - TRTRoIAlignForwardCUDAKernelLauncher_float( + TRTRoIAlignForwardCUDAKernelLauncher( (const float *)feat, (const float *)rois, (float *)output, (float *)argmax_y, (float *)argmax_x, output_size, channels, height, width, mOutHeight, mOutWidth, mSpatialScale, mSampleRatio, mPoolMode, @@ -139,31 +134,25 @@ int RoIAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, return 0; } -nvinfer1::DataType RoIAlignPluginDynamic::getOutputDataType( +nvinfer1::DataType TRTRoIAlign::getOutputDataType( int index, const nvinfer1::DataType *inputTypes, int nbInputs) const { return inputTypes[0]; } // IPluginV2 Methods -const char *RoIAlignPluginDynamic::getPluginType() const { return PLUGIN_NAME; } - -const char *RoIAlignPluginDynamic::getPluginVersion() const { - return PLUGIN_VERSION; -} - -int RoIAlignPluginDynamic::getNbOutputs() const { return 1; } +const char *TRTRoIAlign::getPluginType() const { return PLUGIN_NAME; } -int RoIAlignPluginDynamic::initialize() { return 0; } +const char *TRTRoIAlign::getPluginVersion() const { return PLUGIN_VERSION; } -void RoIAlignPluginDynamic::terminate() {} +int TRTRoIAlign::getNbOutputs() const { return 1; } -size_t RoIAlignPluginDynamic::getSerializationSize() const { +size_t TRTRoIAlign::getSerializationSize() const { return serialized_size(mOutWidth) + serialized_size(mOutHeight) + serialized_size(mSpatialScale) + serialized_size(mSampleRatio) + serialized_size(mPoolMode) + serialized_size(mAligned); } -void RoIAlignPluginDynamic::serialize(void *buffer) const { +void TRTRoIAlign::serialize(void *buffer) const { serialize_value(&buffer, mOutWidth); serialize_value(&buffer, mOutHeight); serialize_value(&buffer, mSpatialScale); @@ -172,22 +161,7 @@ void RoIAlignPluginDynamic::serialize(void *buffer) const { serialize_value(&buffer, mAligned); } -void RoIAlignPluginDynamic::destroy() { - // This gets called when the network containing plugin is destroyed - delete this; -} - -void RoIAlignPluginDynamic::setPluginNamespace(const char *libNamespace) { - mNamespace = libNamespace; -} - -const char *RoIAlignPluginDynamic::getPluginNamespace() const { - return mNamespace.c_str(); -} - -////////////////////// creator ///////////////////////////// - -RoIAlignPluginDynamicCreator::RoIAlignPluginDynamicCreator() { +TRTRoIAlignCreator::TRTRoIAlignCreator() { mPluginAttributes.emplace_back(nvinfer1::PluginField("output_height")); mPluginAttributes.emplace_back(nvinfer1::PluginField("output_width")); mPluginAttributes.emplace_back(nvinfer1::PluginField("spatial_scale")); @@ -198,20 +172,13 @@ RoIAlignPluginDynamicCreator::RoIAlignPluginDynamicCreator() { mFC.fields = mPluginAttributes.data(); } -const char *RoIAlignPluginDynamicCreator::getPluginName() const { - return PLUGIN_NAME; -} +const char *TRTRoIAlignCreator::getPluginName() const { return PLUGIN_NAME; } -const char *RoIAlignPluginDynamicCreator::getPluginVersion() const { +const char *TRTRoIAlignCreator::getPluginVersion() const { return PLUGIN_VERSION; } -const nvinfer1::PluginFieldCollection * -RoIAlignPluginDynamicCreator::getFieldNames() { - return &mFC; -} - -nvinfer1::IPluginV2 *RoIAlignPluginDynamicCreator::createPlugin( +nvinfer1::IPluginV2 *TRTRoIAlignCreator::createPlugin( const char *name, const nvinfer1::PluginFieldCollection *fc) { int outWidth = 7; int outHeight = 7; @@ -253,7 +220,7 @@ nvinfer1::IPluginV2 *RoIAlignPluginDynamicCreator::createPlugin( std::cout << "Unknown pool mode \"" << poolModeStr << "\"." << std::endl; } - assert(poolMode >= 0); + ASSERT(poolMode >= 0); } if (field_name.compare("aligned") == 0) { @@ -262,31 +229,22 @@ nvinfer1::IPluginV2 *RoIAlignPluginDynamicCreator::createPlugin( } } - assert(outHeight > 0); - assert(outWidth > 0); - assert(spatialScale > 0.); - assert(poolMode >= 0); + ASSERT(outHeight > 0); + ASSERT(outWidth > 0); + ASSERT(spatialScale > 0.); + ASSERT(poolMode >= 0); - RoIAlignPluginDynamic *plugin = new RoIAlignPluginDynamic( - name, outWidth, outHeight, spatialScale, sampleRatio, poolMode, aligned); + TRTRoIAlign *plugin = new TRTRoIAlign(name, outWidth, outHeight, spatialScale, + sampleRatio, poolMode, aligned); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -nvinfer1::IPluginV2 *RoIAlignPluginDynamicCreator::deserializePlugin( +nvinfer1::IPluginV2 *TRTRoIAlignCreator::deserializePlugin( const char *name, const void *serialData, size_t serialLength) { - auto plugin = new RoIAlignPluginDynamic(name, serialData, serialLength); + auto plugin = new TRTRoIAlign(name, serialData, serialLength); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } - -void RoIAlignPluginDynamicCreator::setPluginNamespace( - const char *libNamespace) { - mNamespace = libNamespace; -} - -const char *RoIAlignPluginDynamicCreator::getPluginNamespace() const { - return mNamespace.c_str(); -} - -REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); +REGISTER_TENSORRT_PLUGIN(TRTRoIAlignCreator); +} // namespace mmlab diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align.hpp b/backend_ops/tensorrt/roi_align/trt_roi_align.hpp index 00b536c18b..bc35d5d7b0 100644 --- a/backend_ops/tensorrt/roi_align/trt_roi_align.hpp +++ b/backend_ops/tensorrt/roi_align/trt_roi_align.hpp @@ -6,18 +6,16 @@ #include #include -#include "trt_plugin_helper.hpp" - -class RoIAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt { +#include "trt_plugin_base.hpp" +namespace mmlab { +class TRTRoIAlign : public TRTPluginBase { public: - RoIAlignPluginDynamic(const std::string &name, int outWidth, int outHeight, - float spatialScale, int sampleRatio, int poolMode, - bool aligned); + TRTRoIAlign(const std::string &name, int outWidth, int outHeight, + float spatialScale, int sampleRatio, int poolMode, bool aligned); - RoIAlignPluginDynamic(const std::string name, const void *data, - size_t length); + TRTRoIAlign(const std::string name, const void *data, size_t length); - RoIAlignPluginDynamic() = delete; + TRTRoIAlign() = delete; // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt *clone() const override; @@ -49,60 +47,31 @@ class RoIAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt { const char *getPluginType() const override; const char *getPluginVersion() const override; int getNbOutputs() const override; - int initialize() override; - void terminate() override; size_t getSerializationSize() const override; void serialize(void *buffer) const override; - void destroy() override; - void setPluginNamespace(const char *pluginNamespace) override; - const char *getPluginNamespace() const override; private: - const std::string mLayerName; - std::string mNamespace; - int mOutWidth; int mOutHeight; float mSpatialScale; int mSampleRatio; int mPoolMode; // 1:avg 0:max bool mAligned; - - protected: - // To prevent compiler warnings. - using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::configurePlugin; - using nvinfer1::IPluginV2DynamicExt::enqueue; - using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; - using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize; - using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::supportsFormat; }; -class RoIAlignPluginDynamicCreator : public nvinfer1::IPluginCreator { +class TRTRoIAlignCreator : public TRTPluginCreatorBase { public: - RoIAlignPluginDynamicCreator(); + TRTRoIAlignCreator(); const char *getPluginName() const override; const char *getPluginVersion() const override; - - const nvinfer1::PluginFieldCollection *getFieldNames() override; - nvinfer1::IPluginV2 *createPlugin( const char *name, const nvinfer1::PluginFieldCollection *fc) override; nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) override; - - void setPluginNamespace(const char *pluginNamespace) override; - - const char *getPluginNamespace() const override; - - private: - nvinfer1::PluginFieldCollection mFC; - std::vector mPluginAttributes; - std::string mNamespace; }; +} // namespace mmlab #endif // TRT_ROI_ALIGN_HPP diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu index f744c80ec8..fba2b14c20 100644 --- a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu +++ b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu @@ -1,5 +1,98 @@ #include "common_cuda_helper.hpp" -#include "trt_roi_align_kernel.cuh" +#include "float.h" +#include "trt_roi_align_kernel.hpp" + +/*** Forward ***/ +template +__global__ void roi_align_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, T* output, T* argmax_y, + T* argmax_x, const int pooled_height, const int pooled_width, + const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not using rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + if (pool_mode == 0) { + // We do max pooling inside a bin + T maxval = -FLT_MAX; + T maxidx_y = -1.f, maxidx_x = -1.f; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = bilinear_interpolate(offset_input, height, width, y, x); + if (val > maxval) { + maxval = val; + maxidx_y = y; + maxidx_x = x; + } + } + } + output[index] = maxval; + argmax_y[index] = maxidx_y; + argmax_x[index] = maxidx_x; + } else if (pool_mode == 1) { + // We do average pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = bilinear_interpolate(offset_input, height, width, y, x); + output_val += val; + } + } + output[index] = output_val / count; + } + } +} template void TRTRoIAlignForwardCUDAKernelLauncher( @@ -15,13 +108,8 @@ void TRTRoIAlignForwardCUDAKernelLauncher( pool_mode, aligned, channels, height, width); } -void TRTRoIAlignForwardCUDAKernelLauncher_float( +template void TRTRoIAlignForwardCUDAKernelLauncher( const float* input, const float* rois, float* output, float* argmax_y, float* argmax_x, int output_size, int channels, int height, int width, int aligned_height, int aligned_width, float spatial_scale, - int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream) { - TRTRoIAlignForwardCUDAKernelLauncher( - input, rois, output, argmax_y, argmax_x, output_size, channels, height, - width, aligned_height, aligned_width, spatial_scale, sampling_ratio, - pool_mode, aligned, stream); -} + int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream); diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cuh b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cuh deleted file mode 100644 index c6f61d8941..0000000000 --- a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cuh +++ /dev/null @@ -1,204 +0,0 @@ -#ifndef ROI_ALIGN_CUDA_KERNEL_CUH -#define ROI_ALIGN_CUDA_KERNEL_CUH - -#include - -#include "common_cuda_helper.hpp" - -/*** Forward ***/ -template -__global__ void roi_align_forward_cuda_kernel( - const int nthreads, const T* input, const T* rois, T* output, T* argmax_y, - T* argmax_x, const int pooled_height, const int pooled_width, - const T spatial_scale, const int sampling_ratio, - const int pool_mode, // 0 - max pool, 1 - avg pool - const bool aligned, const int channels, const int height, const int width) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T* offset_rois = rois + n * 5; - int roi_batch_ind = offset_rois[0]; - - // Do not using rounding; this implementation detail is critical - T offset = aligned ? (T)0.5 : (T)0.0; - T roi_start_w = offset_rois[1] * spatial_scale - offset; - T roi_start_h = offset_rois[2] * spatial_scale - offset; - T roi_end_w = offset_rois[3] * spatial_scale - offset; - T roi_end_h = offset_rois[4] * spatial_scale - offset; - - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - if (!aligned) { // for backward-compatibility only - roi_width = max(roi_width, (T)1.); - roi_height = max(roi_height, (T)1.); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T* offset_input = - input + (roi_batch_ind * channels + c) * height * width; - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = - (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceilf(roi_height / pooled_height)); - int roi_bin_grid_w = - (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceilf(roi_width / pooled_width)); - - if (pool_mode == 0) { - // We do max pooling inside a bin - T maxval = -FLT_MAX; - T maxidx_y = -1.f, maxidx_x = -1.f; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T val = - bilinear_interpolate(offset_input, height, width, y, x, index); - if (val > maxval) { - maxval = val; - maxidx_y = y; - maxidx_x = x; - } - } - } - output[index] = maxval; - argmax_y[index] = maxidx_y; - argmax_x[index] = maxidx_x; - } else if (pool_mode == 1) { - // We do average pooling inside a bin - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); - T output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T val = - bilinear_interpolate(offset_input, height, width, y, x, index); - output_val += val; - } - } - output[index] = output_val / count; - } - } -} - -/*** Backward ***/ -template -__global__ void roi_align_backward_cuda_kernel( - const int nthreads, const T* grad_output, const T* rois, const T* argmax_y, - const T* argmax_x, T* grad_input, const int pooled_height, - const int pooled_width, const T spatial_scale, const int sampling_ratio, - const int pool_mode, // 0 - max pool, 1 - avg pool - const bool aligned, const int channels, const int height, const int width) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T grad_output_this_bin = grad_output[index]; - - const T* offset_rois = rois + n * 5; - int roi_batch_ind = offset_rois[0]; - T* offset_grad_input = - grad_input + ((roi_batch_ind * channels + c) * height * width); - - if (pool_mode == 0) { - T y = argmax_y[index], x = argmax_x[index]; - if (y != -1.f) { - T w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, - x_low, x_high, y_low, y_high, index); - - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - atomicAdd(offset_grad_input + y_low * width + x_low, - grad_output_this_bin * w1); - atomicAdd(offset_grad_input + y_low * width + x_high, - grad_output_this_bin * w2); - atomicAdd(offset_grad_input + y_high * width + x_low, - grad_output_this_bin * w3); - atomicAdd(offset_grad_input + y_high * width + x_high, - grad_output_this_bin * w4); - } - } - } else if (pool_mode == 1) { - // Do not using rounding; this implementation detail is critical - T offset = aligned ? (T)0.5 : (T)0.0; - T roi_start_w = offset_rois[1] * spatial_scale - offset; - T roi_start_h = offset_rois[2] * spatial_scale - offset; - T roi_end_w = offset_rois[3] * spatial_scale - offset; - T roi_end_h = offset_rois[4] * spatial_scale - offset; - - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - if (!aligned) { // for backward-compatibility only - roi_width = max(roi_width, (T)1.); - roi_height = max(roi_height, (T)1.); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = - (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceilf(roi_height / pooled_height)); - int roi_bin_grid_w = - (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceilf(roi_width / pooled_width)); - - // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 - - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, - x_low, x_high, y_low, y_high, index); - - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - atomicAdd(offset_grad_input + y_low * width + x_low, - grad_output_this_bin * w1 / count); - atomicAdd(offset_grad_input + y_low * width + x_high, - grad_output_this_bin * w2 / count); - atomicAdd(offset_grad_input + y_high * width + x_low, - grad_output_this_bin * w3 / count); - atomicAdd(offset_grad_input + y_high * width + x_high, - grad_output_this_bin * w4 / count); - } - } - } - } - } -} - -#endif // ROI_ALIGN_CUDA_KERNEL_CUH diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp new file mode 100644 index 0000000000..d5a8dd551c --- /dev/null +++ b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp @@ -0,0 +1,14 @@ +#ifndef ROI_ALIGN_CUDA_KERNEL_HPP +#define ROI_ALIGN_CUDA_KERNEL_HPP + +#include "common_cuda_helper.hpp" + +template +void TRTRoIAlignForwardCUDAKernelLauncher( + const scalar_t* input, const scalar_t* rois, scalar_t* output, + scalar_t* argmax_y, scalar_t* argmax_x, int output_size, int channels, + int height, int width, int aligned_height, int aligned_width, + scalar_t spatial_scale, int sampling_ratio, int pool_mode, bool aligned, + cudaStream_t stream); + +#endif // ROI_ALIGN_CUDA_KERNEL_HPP diff --git a/backend_ops/tensorrt/scatternd/trt_scatternd.cpp b/backend_ops/tensorrt/scatternd/trt_scatternd.cpp index 03e4d657f3..f5c85909f2 100644 --- a/backend_ops/tensorrt/scatternd/trt_scatternd.cpp +++ b/backend_ops/tensorrt/scatternd/trt_scatternd.cpp @@ -8,32 +8,32 @@ #include "trt_scatternd_kernel.hpp" #include "trt_serialize.hpp" +namespace mmlab { namespace { static const char *PLUGIN_VERSION{"1"}; static const char *PLUGIN_NAME{"ScatterND"}; } // namespace -ONNXScatterNDDynamic::ONNXScatterNDDynamic(const std::string &name) - : mLayerName(name) {} +TRTScatterND::TRTScatterND(const std::string &name) : TRTPluginBase(name) {} -ONNXScatterNDDynamic::ONNXScatterNDDynamic(const std::string name, - const void *data, size_t length) - : mLayerName(name) {} +TRTScatterND::TRTScatterND(const std::string name, const void *data, + size_t length) + : TRTPluginBase(name) {} -nvinfer1::IPluginV2DynamicExt *ONNXScatterNDDynamic::clone() const { - ONNXScatterNDDynamic *plugin = new ONNXScatterNDDynamic(mLayerName); +nvinfer1::IPluginV2DynamicExt *TRTScatterND::clone() const { + TRTScatterND *plugin = new TRTScatterND(mLayerName); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -nvinfer1::DimsExprs ONNXScatterNDDynamic::getOutputDimensions( +nvinfer1::DimsExprs TRTScatterND::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, nvinfer1::IExprBuilder &exprBuilder) { return inputs[0]; } -bool ONNXScatterNDDynamic::supportsFormatCombination( +bool TRTScatterND::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs, int nbOutputs) { if (pos < nbInputs) { @@ -68,21 +68,21 @@ bool ONNXScatterNDDynamic::supportsFormatCombination( return true; } -void ONNXScatterNDDynamic::configurePlugin( +void TRTScatterND::configurePlugin( const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {} -size_t ONNXScatterNDDynamic::getWorkspaceSize( - const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const { +size_t TRTScatterND::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, + int nbOutputs) const { return 0; } -int ONNXScatterNDDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, - void *const *outputs, void *workSpace, - cudaStream_t stream) { +int TRTScatterND::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, + void *workSpace, cudaStream_t stream) { const int *dims = &(inputDesc[0].dims.d[0]); const int *indices_dims = &(inputDesc[1].dims.d[0]); int nbDims = inputDesc[0].dims.nbDims; @@ -114,82 +114,47 @@ int ONNXScatterNDDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, return 0; } -nvinfer1::DataType ONNXScatterNDDynamic::getOutputDataType( +nvinfer1::DataType TRTScatterND::getOutputDataType( int index, const nvinfer1::DataType *inputTypes, int nbInputs) const { return inputTypes[0]; } // IPluginV2 Methods -const char *ONNXScatterNDDynamic::getPluginType() const { return PLUGIN_NAME; } +const char *TRTScatterND::getPluginType() const { return PLUGIN_NAME; } -const char *ONNXScatterNDDynamic::getPluginVersion() const { - return PLUGIN_VERSION; -} - -int ONNXScatterNDDynamic::getNbOutputs() const { return 1; } - -int ONNXScatterNDDynamic::initialize() { return 0; } - -void ONNXScatterNDDynamic::terminate() {} - -size_t ONNXScatterNDDynamic::getSerializationSize() const { return 0; } - -void ONNXScatterNDDynamic::serialize(void *buffer) const {} - -void ONNXScatterNDDynamic::destroy() { - // This gets called when the network containing plugin is destroyed - delete this; -} +const char *TRTScatterND::getPluginVersion() const { return PLUGIN_VERSION; } -void ONNXScatterNDDynamic::setPluginNamespace(const char *libNamespace) { - mNamespace = libNamespace; -} +int TRTScatterND::getNbOutputs() const { return 1; } -const char *ONNXScatterNDDynamic::getPluginNamespace() const { - return mNamespace.c_str(); -} +size_t TRTScatterND::getSerializationSize() const { return 0; } -////////////////////// creator ///////////////////////////// +void TRTScatterND::serialize(void *buffer) const {} -ONNXScatterNDDynamicCreator::ONNXScatterNDDynamicCreator() { +TRTScatterNDCreator::TRTScatterNDCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } -const char *ONNXScatterNDDynamicCreator::getPluginName() const { - return PLUGIN_NAME; -} +const char *TRTScatterNDCreator::getPluginName() const { return PLUGIN_NAME; } -const char *ONNXScatterNDDynamicCreator::getPluginVersion() const { +const char *TRTScatterNDCreator::getPluginVersion() const { return PLUGIN_VERSION; } -const nvinfer1::PluginFieldCollection * -ONNXScatterNDDynamicCreator::getFieldNames() { - return &mFC; -} - -nvinfer1::IPluginV2 *ONNXScatterNDDynamicCreator::createPlugin( +nvinfer1::IPluginV2 *TRTScatterNDCreator::createPlugin( const char *name, const nvinfer1::PluginFieldCollection *fc) { - ONNXScatterNDDynamic *plugin = new ONNXScatterNDDynamic(name); + TRTScatterND *plugin = new TRTScatterND(name); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -nvinfer1::IPluginV2 *ONNXScatterNDDynamicCreator::deserializePlugin( +nvinfer1::IPluginV2 *TRTScatterNDCreator::deserializePlugin( const char *name, const void *serialData, size_t serialLength) { - auto plugin = new ONNXScatterNDDynamic(name, serialData, serialLength); + auto plugin = new TRTScatterND(name, serialData, serialLength); plugin->setPluginNamespace(getPluginNamespace()); return plugin; } -void ONNXScatterNDDynamicCreator::setPluginNamespace(const char *libNamespace) { - mNamespace = libNamespace; -} - -const char *ONNXScatterNDDynamicCreator::getPluginNamespace() const { - return mNamespace.c_str(); -} - -REGISTER_TENSORRT_PLUGIN(ONNXScatterNDDynamicCreator); +REGISTER_TENSORRT_PLUGIN(TRTScatterNDCreator); +} // namespace mmlab diff --git a/backend_ops/tensorrt/scatternd/trt_scatternd.hpp b/backend_ops/tensorrt/scatternd/trt_scatternd.hpp index f64662c0f0..a5e08ce835 100644 --- a/backend_ops/tensorrt/scatternd/trt_scatternd.hpp +++ b/backend_ops/tensorrt/scatternd/trt_scatternd.hpp @@ -6,15 +6,16 @@ #include #include -#include "trt_plugin_helper.hpp" +#include "trt_plugin_base.hpp" -class ONNXScatterNDDynamic : public nvinfer1::IPluginV2DynamicExt { +namespace mmlab { +class TRTScatterND : public TRTPluginBase { public: - ONNXScatterNDDynamic(const std::string &name); + TRTScatterND(const std::string &name); - ONNXScatterNDDynamic(const std::string name, const void *data, size_t length); + TRTScatterND(const std::string name, const void *data, size_t length); - ONNXScatterNDDynamic() = delete; + TRTScatterND() = delete; // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt *clone() const override; @@ -46,53 +47,23 @@ class ONNXScatterNDDynamic : public nvinfer1::IPluginV2DynamicExt { const char *getPluginType() const override; const char *getPluginVersion() const override; int getNbOutputs() const override; - int initialize() override; - void terminate() override; size_t getSerializationSize() const override; void serialize(void *buffer) const override; - void destroy() override; - void setPluginNamespace(const char *pluginNamespace) override; - const char *getPluginNamespace() const override; - - private: - const std::string mLayerName; - std::string mNamespace; - - protected: - // To prevent compiler warnings. - using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::configurePlugin; - using nvinfer1::IPluginV2DynamicExt::enqueue; - using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; - using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize; - using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::supportsFormat; }; -class ONNXScatterNDDynamicCreator : public nvinfer1::IPluginCreator { +class TRTScatterNDCreator : public TRTPluginCreatorBase { public: - ONNXScatterNDDynamicCreator(); + TRTScatterNDCreator(); const char *getPluginName() const override; const char *getPluginVersion() const override; - - const nvinfer1::PluginFieldCollection *getFieldNames() override; - nvinfer1::IPluginV2 *createPlugin( const char *name, const nvinfer1::PluginFieldCollection *fc) override; nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) override; - - void setPluginNamespace(const char *pluginNamespace) override; - - const char *getPluginNamespace() const override; - - private: - nvinfer1::PluginFieldCollection mFC; - std::vector mPluginAttributes; - std::string mNamespace; }; +} // namespace mmlab #endif // TRT_SCATTERND_HPP diff --git a/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu b/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu index d7b59d0c10..2adda61485 100644 --- a/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu +++ b/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu @@ -3,7 +3,6 @@ #include #include "common_cuda_helper.hpp" -#include "trt_cuda_helper.cuh" #include "trt_plugin_helper.hpp" using mmlab::TensorDesc;