From 3fa94f442464e55e68a0eaeb30782f57629aed4d Mon Sep 17 00:00:00 2001
From: "q.yao" <streetyao@live.com>
Date: Mon, 12 Jul 2021 10:01:52 +0800
Subject: [PATCH] [Feature] better tensorrt cpp code (#11)

* better tensorrt cpp code

* fix end of file
---
 .../{gridSample.cpp => grid_sample.cpp}       |   0
 .../tensorrt/batched_nms/trt_batched_nms.cpp  |  99 ++++-----
 .../tensorrt/batched_nms/trt_batched_nms.hpp  |  53 +----
 ...ference.cpp => trt_batched_nms_kernel.cpp} |   3 +-
 .../batched_nms/trt_batched_nms_kernel.hpp    |  16 ++
 .../tensorrt/common/common_cuda_helper.hpp    | 109 ++++------
 .../tensorrt/common/trt_cuda_helper.cuh       |  30 ---
 .../tensorrt/common/trt_plugin_base.hpp       |  55 +++++
 .../tensorrt/common/trt_plugin_helper.hpp     |   1 +
 backend_ops/tensorrt/common/trt_serialize.hpp |   4 -
 .../tensorrt/common_impl/trt_cuda_helper.cu   |   1 -
 .../trt_multi_level_roi_align.cpp             | 109 ++++------
 .../trt_multi_level_roi_align.hpp             |  58 ++---
 .../trt_multi_level_roi_align_kernel.cu       |  48 -----
 backend_ops/tensorrt/nms/trt_nms.cpp          | 152 +++++--------
 backend_ops/tensorrt/nms/trt_nms.hpp          |  51 +----
 backend_ops/tensorrt/nms/trt_nms_kernel.cu    |  80 ++++++-
 backend_ops/tensorrt/nms/trt_nms_kernel.cuh   |  67 ------
 backend_ops/tensorrt/nms/trt_nms_kernel.hpp   |  18 ++
 .../tensorrt/roi_align/trt_roi_align.cpp      | 140 +++++-------
 .../tensorrt/roi_align/trt_roi_align.hpp      |  51 +----
 .../roi_align/trt_roi_align_kernel.cu         | 104 ++++++++-
 .../roi_align/trt_roi_align_kernel.cuh        | 204 ------------------
 .../roi_align/trt_roi_align_kernel.hpp        |  14 ++
 .../tensorrt/scatternd/trt_scatternd.cpp      | 101 +++------
 .../tensorrt/scatternd/trt_scatternd.hpp      |  47 +---
 .../scatternd/trt_scatternd_kernel.cu         |   1 -
 27 files changed, 573 insertions(+), 1043 deletions(-)
 rename backend_ops/onnxruntime/grid_sample/{gridSample.cpp => grid_sample.cpp} (100%)
 rename backend_ops/tensorrt/batched_nms/{batchedNMSInference.cpp => trt_batched_nms_kernel.cpp} (99%)
 create mode 100644 backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.hpp
 delete mode 100644 backend_ops/tensorrt/common/trt_cuda_helper.cuh
 create mode 100644 backend_ops/tensorrt/common/trt_plugin_base.hpp
 delete mode 100644 backend_ops/tensorrt/nms/trt_nms_kernel.cuh
 create mode 100644 backend_ops/tensorrt/nms/trt_nms_kernel.hpp
 delete mode 100644 backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cuh
 create mode 100644 backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp
diff --git a/backend_ops/onnxruntime/grid_sample/gridSample.cpp b/backend_ops/onnxruntime/grid_sample/grid_sample.cpp
similarity index 100%
rename from backend_ops/onnxruntime/grid_sample/gridSample.cpp
rename to backend_ops/onnxruntime/grid_sample/grid_sample.cpp
diff --git a/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp b/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp
index bdde30d6de..e7f39c5942 100644
--- a/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp
+++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp
@@ -6,8 +6,10 @@
 #include <cstring>
 
 #include "kernel.h"
+#include "trt_batched_nms_kernel.hpp"
 #include "trt_serialize.hpp"
 
+namespace mmlab {
 using namespace nvinfer1;
 using nvinfer1::plugin::NMSParameters;
 
@@ -16,11 +18,12 @@ static const char* NMS_PLUGIN_VERSION{"1"};
 static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"};
 }  // namespace
 
-TRTBatchedNMSPluginDynamic::TRTBatchedNMSPluginDynamic(NMSParameters params)
-    : param(params) {}
+TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params)
+    : TRTPluginBase(name), param(params) {}
 
-TRTBatchedNMSPluginDynamic::TRTBatchedNMSPluginDynamic(const void* data,
-                                                       size_t length) {
+TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data,
+                             size_t length)
+    : TRTPluginBase(name) {
   deserialize_value(&data, &length, &param);
   deserialize_value(&data, &length, &boxesSize);
   deserialize_value(&data, &length, &scoresSize);
@@ -28,13 +31,9 @@ TRTBatchedNMSPluginDynamic::TRTBatchedNMSPluginDynamic(const void* data,
   deserialize_value(&data, &length, &mClipBoxes);
 }
 
-int TRTBatchedNMSPluginDynamic::getNbOutputs() const { return 2; }
+int TRTBatchedNMS::getNbOutputs() const { return 2; }
 
-int TRTBatchedNMSPluginDynamic::initialize() { return STATUS_SUCCESS; }
-
-void TRTBatchedNMSPluginDynamic::terminate() {}
-
-nvinfer1::DimsExprs TRTBatchedNMSPluginDynamic::getOutputDimensions(
+nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions(
     int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
     nvinfer1::IExprBuilder& exprBuilder) {
   ASSERT(nbInputs == 2);
@@ -60,7 +59,7 @@ nvinfer1::DimsExprs TRTBatchedNMSPluginDynamic::getOutputDimensions(
   return ret;
 }
 
-size_t TRTBatchedNMSPluginDynamic::getWorkspaceSize(
+size_t TRTBatchedNMS::getWorkspaceSize(
     const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
     const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
   size_t batch_size = inputs[0].dims.d[0];
@@ -75,10 +74,10 @@ size_t TRTBatchedNMSPluginDynamic::getWorkspaceSize(
       num_priors, topk, DataType::kFLOAT, DataType::kFLOAT);
 }
 
-int TRTBatchedNMSPluginDynamic::enqueue(
-    const nvinfer1::PluginTensorDesc* inputDesc,
-    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
-    void* const* outputs, void* workSpace, cudaStream_t stream) {
+int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                           const nvinfer1::PluginTensorDesc* outputDesc,
+                           const void* const* inputs, void* const* outputs,
+                           void* workSpace, cudaStream_t stream) {
   const void* const locData = inputs[0];
   const void* const confData = inputs[1];
 
@@ -103,12 +102,12 @@ int TRTBatchedNMSPluginDynamic::enqueue(
   return 0;
 }
 
-size_t TRTBatchedNMSPluginDynamic::getSerializationSize() const {
+size_t TRTBatchedNMS::getSerializationSize() const {
   // NMSParameters, boxesSize,scoresSize,numPriors
   return sizeof(NMSParameters) + sizeof(int) * 3 + sizeof(bool);
 }
 
-void TRTBatchedNMSPluginDynamic::serialize(void* buffer) const {
+void TRTBatchedNMS::serialize(void* buffer) const {
   serialize_value(&buffer, param);
   serialize_value(&buffer, boxesSize);
   serialize_value(&buffer, scoresSize);
@@ -116,13 +115,13 @@ void TRTBatchedNMSPluginDynamic::serialize(void* buffer) const {
   serialize_value(&buffer, mClipBoxes);
 }
 
-void TRTBatchedNMSPluginDynamic::configurePlugin(
+void TRTBatchedNMS::configurePlugin(
     const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
     const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) {
   // Validate input arguments
 }
 
-bool TRTBatchedNMSPluginDynamic::supportsFormatCombination(
+bool TRTBatchedNMS::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
     int nbOutputs) {
   if (pos == 3) {
@@ -133,18 +132,14 @@ bool TRTBatchedNMSPluginDynamic::supportsFormatCombination(
          inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
 }
 
-const char* TRTBatchedNMSPluginDynamic::getPluginType() const {
-  return NMS_PLUGIN_NAME;
-}
+const char* TRTBatchedNMS::getPluginType() const { return NMS_PLUGIN_NAME; }
 
-const char* TRTBatchedNMSPluginDynamic::getPluginVersion() const {
+const char* TRTBatchedNMS::getPluginVersion() const {
   return NMS_PLUGIN_VERSION;
 }
 
-void TRTBatchedNMSPluginDynamic::destroy() { delete this; }
-
-IPluginV2DynamicExt* TRTBatchedNMSPluginDynamic::clone() const {
-  auto* plugin = new TRTBatchedNMSPluginDynamic(param);
+IPluginV2DynamicExt* TRTBatchedNMS::clone() const {
+  auto* plugin = new TRTBatchedNMS(mLayerName, param);
   plugin->boxesSize = boxesSize;
   plugin->scoresSize = scoresSize;
   plugin->numPriors = numPriors;
@@ -153,16 +148,7 @@ IPluginV2DynamicExt* TRTBatchedNMSPluginDynamic::clone() const {
   return plugin;
 }
 
-void TRTBatchedNMSPluginDynamic::setPluginNamespace(
-    const char* pluginNamespace) {
-  mNamespace = pluginNamespace;
-}
-
-const char* TRTBatchedNMSPluginDynamic::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-nvinfer1::DataType TRTBatchedNMSPluginDynamic::getOutputDataType(
+nvinfer1::DataType TRTBatchedNMS::getOutputDataType(
     int index, const nvinfer1::DataType* inputTypes, int nbInputs) const {
   ASSERT(index >= 0 && index < this->getNbOutputs());
   if (index == 1) {
@@ -171,10 +157,9 @@ nvinfer1::DataType TRTBatchedNMSPluginDynamic::getOutputDataType(
   return inputTypes[0];
 }
 
-void TRTBatchedNMSPluginDynamic::setClipParam(bool clip) { mClipBoxes = clip; }
+void TRTBatchedNMS::setClipParam(bool clip) { mClipBoxes = clip; }
 
-TRTBatchedNMSPluginDynamicCreator::TRTBatchedNMSPluginDynamicCreator()
-    : params{} {
+TRTBatchedNMSCreator::TRTBatchedNMSCreator() {
   mPluginAttributes.emplace_back(
       PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
   mPluginAttributes.emplace_back(
@@ -196,23 +181,19 @@ TRTBatchedNMSPluginDynamicCreator::TRTBatchedNMSPluginDynamicCreator()
   mFC.fields = mPluginAttributes.data();
 }
 
-const char* TRTBatchedNMSPluginDynamicCreator::getPluginName() const {
+const char* TRTBatchedNMSCreator::getPluginName() const {
   return NMS_PLUGIN_NAME;
 }
 
-const char* TRTBatchedNMSPluginDynamicCreator::getPluginVersion() const {
+const char* TRTBatchedNMSCreator::getPluginVersion() const {
   return NMS_PLUGIN_VERSION;
 }
 
-const PluginFieldCollection*
-TRTBatchedNMSPluginDynamicCreator::getFieldNames() {
-  return &mFC;
-}
-
-IPluginV2Ext* TRTBatchedNMSPluginDynamicCreator::createPlugin(
+IPluginV2Ext* TRTBatchedNMSCreator::createPlugin(
     const char* name, const PluginFieldCollection* fc) {
   const PluginField* fields = fc->fields;
   bool clipBoxes = true;
+  nvinfer1::plugin::NMSParameters params{};
 
   for (int i = 0; i < fc->nbFields; ++i) {
     const char* attrName = fields[i].name;
@@ -241,29 +222,21 @@ IPluginV2Ext* TRTBatchedNMSPluginDynamicCreator::createPlugin(
     }
   }
 
-  TRTBatchedNMSPluginDynamic* plugin = new TRTBatchedNMSPluginDynamic(params);
+  TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params);
   plugin->setClipParam(clipBoxes);
   plugin->setPluginNamespace(mNamespace.c_str());
   return plugin;
 }
 
-IPluginV2Ext* TRTBatchedNMSPluginDynamicCreator::deserializePlugin(
-    const char* name, const void* serialData, size_t serialLength) {
+IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name,
+                                                      const void* serialData,
+                                                      size_t serialLength) {
   // This object will be deleted when the network is destroyed, which will
   // call NMS::destroy()
-  TRTBatchedNMSPluginDynamic* plugin =
-      new TRTBatchedNMSPluginDynamic(serialData, serialLength);
+  TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength);
   plugin->setPluginNamespace(mNamespace.c_str());
   return plugin;
 }
 
-void TRTBatchedNMSPluginDynamicCreator::setPluginNamespace(
-    const char* libNamespace) {
-  mNamespace = libNamespace;
-}
-
-const char* TRTBatchedNMSPluginDynamicCreator::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSPluginDynamicCreator);
+REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSCreator);
+}  // namespace mmlab
diff --git a/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp b/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
index 015d62725f..cb374e7c31 100644
--- a/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
+++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
@@ -5,15 +5,15 @@
 #include <string>
 #include <vector>
 
-#include "trt_plugin_helper.hpp"
-
-class TRTBatchedNMSPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
+#include "trt_plugin_base.hpp"
+namespace mmlab {
+class TRTBatchedNMS : public TRTPluginBase {
  public:
-  TRTBatchedNMSPluginDynamic(nvinfer1::plugin::NMSParameters param);
+  TRTBatchedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param);
 
-  TRTBatchedNMSPluginDynamic(const void* data, size_t length);
+  TRTBatchedNMS(const std::string& name, const void* data, size_t length);
 
-  ~TRTBatchedNMSPluginDynamic() override = default;
+  ~TRTBatchedNMS() override = default;
 
   int getNbOutputs() const override;
 
@@ -21,10 +21,6 @@ class TRTBatchedNMSPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
       int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
       nvinfer1::IExprBuilder& exprBuilder) override;
 
-  int initialize() override;
-
-  void terminate() override;
-
   size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                           int nbInputs,
                           const nvinfer1::PluginTensorDesc* outputs,
@@ -52,18 +48,12 @@ class TRTBatchedNMSPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
 
   const char* getPluginVersion() const override;
 
-  void destroy() override;
-
   nvinfer1::IPluginV2DynamicExt* clone() const override;
 
   nvinfer1::DataType getOutputDataType(int index,
                                        const nvinfer1::DataType* inputType,
                                        int nbInputs) const override;
 
-  void setPluginNamespace(const char* libNamespace) override;
-
-  const char* getPluginNamespace() const override;
-
   void setClipParam(bool clip);
 
  private:
@@ -71,48 +61,25 @@ class TRTBatchedNMSPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
   int boxesSize{};
   int scoresSize{};
   int numPriors{};
-  std::string mNamespace;
   bool mClipBoxes{};
-
- protected:
-  // To prevent compiler warnings.
-  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::configurePlugin;
-  using nvinfer1::IPluginV2DynamicExt::enqueue;
-  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
-  using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize;
-  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
 };
 
-class TRTBatchedNMSPluginDynamicCreator : public nvinfer1::IPluginCreator {
+class TRTBatchedNMSCreator : public TRTPluginCreatorBase {
  public:
-  TRTBatchedNMSPluginDynamicCreator();
+  TRTBatchedNMSCreator();
 
-  ~TRTBatchedNMSPluginDynamicCreator() override = default;
+  ~TRTBatchedNMSCreator() override = default;
 
   const char* getPluginName() const override;
 
   const char* getPluginVersion() const override;
 
-  const nvinfer1::PluginFieldCollection* getFieldNames() override;
-
   nvinfer1::IPluginV2Ext* createPlugin(
       const char* name, const nvinfer1::PluginFieldCollection* fc) override;
 
   nvinfer1::IPluginV2Ext* deserializePlugin(const char* name,
                                             const void* serialData,
                                             size_t serialLength) override;
-
-  void setPluginNamespace(const char* libNamespace) override;
-
-  const char* getPluginNamespace() const override;
-
- private:
-  nvinfer1::PluginFieldCollection mFC;
-  nvinfer1::plugin::NMSParameters params;
-  std::vector<nvinfer1::PluginField> mPluginAttributes;
-  std::string mNamespace;
 };
-
+}  // namespace mmlab
 #endif  // TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
diff --git a/backend_ops/tensorrt/batched_nms/batchedNMSInference.cpp b/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.cpp
similarity index 99%
rename from backend_ops/tensorrt/batched_nms/batchedNMSInference.cpp
rename to backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.cpp
index bbdc42626f..0be033b437 100644
--- a/backend_ops/tensorrt/batched_nms/batchedNMSInference.cpp
+++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.cpp
@@ -1,7 +1,6 @@
 // modify from
 // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
-#include "cuda_runtime_api.h"
-#include "kernel.h"
+#include "trt_batched_nms_kernel.hpp"
 
 pluginStatus_t nmsInference(
     cudaStream_t stream, const int N, const int perBatchBoxesSize,
diff --git a/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.hpp b/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.hpp
new file mode 100644
index 0000000000..133a212743
--- /dev/null
+++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms_kernel.hpp
@@ -0,0 +1,16 @@
+#ifndef TRT_BATCHED_NMS_KERNEL_HPP
+#define TRT_BATCHED_NMS_KERNEL_HPP
+#include "cuda_runtime_api.h"
+#include "kernel.h"
+
+pluginStatus_t nmsInference(
+    cudaStream_t stream, const int N, const int perBatchBoxesSize,
+    const int perBatchScoresSize, const bool shareLocation,
+    const int backgroundLabelId, const int numPredsPerClass,
+    const int numClasses, const int topK, const int keepTopK,
+    const float scoreThreshold, const float iouThreshold,
+    const DataType DT_BBOX, const void* locData, const DataType DT_SCORE,
+    const void* confData, void* nmsedDets, void* nmsedLabels, void* workspace,
+    bool isNormalized, bool confSigmoid, bool clipBoxes);
+
+#endif
diff --git a/backend_ops/tensorrt/common/common_cuda_helper.hpp b/backend_ops/tensorrt/common/common_cuda_helper.hpp
index a9ab6e82f1..be21d7fccf 100644
--- a/backend_ops/tensorrt/common/common_cuda_helper.hpp
+++ b/backend_ops/tensorrt/common/common_cuda_helper.hpp
@@ -3,22 +3,49 @@
 
 #include <cuda.h>
 
+#include <algorithm>
+
 #define CUDA_1D_KERNEL_LOOP(i, n)                              \
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
        i += blockDim.x * gridDim.x)
 
 #define THREADS_PER_BLOCK 512
 
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 inline int GET_BLOCKS(const int N) {
-  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK);
   int max_block_num = 4096;
-  return min(optimal_block_num, max_block_num);
+  return std::min(optimal_block_num, max_block_num);
 }
 
-template <typename T>
-__device__ T bilinear_interpolate(const T* input, const int height,
-                                  const int width, T y, T x,
-                                  const int index /* index for debug only*/) {
+#define cudaCheckError()                                       \
+  {                                                            \
+    cudaError_t e = cudaGetLastError();                        \
+    if (e != cudaSuccess) {                                    \
+      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
+             cudaGetErrorString(e));                           \
+      exit(0);                                                 \
+    }                                                          \
+  }
+
+/**
+ * Returns a view of the original tensor with its dimensions permuted.
+ *
+ * @param[out] dst pointer to the destination tensor
+ * @param[in] src pointer to the source tensor
+ * @param[in] src_size shape of the src tensor
+ * @param[in] permute The desired ordering of dimensions
+ * @param[in] src_dim dim of src tensor
+ * @param[in] stream cuda stream handle
+ */
+template <class scalar_t>
+void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size,
+                   int *permute, int src_dim, cudaStream_t stream = 0);
+
+template <typename scalar_t>
+__device__ scalar_t bilinear_interpolate(const scalar_t *input,
+                                         const int height, const int width,
+                                         scalar_t y, scalar_t x) {
   // deal with cases that inverse elements are out of feature map boundary
   if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
 
@@ -32,79 +59,31 @@ __device__ T bilinear_interpolate(const T* input, const int height,
 
   if (y_low >= height - 1) {
     y_high = y_low = height - 1;
-    y = (T)y_low;
+    y = (scalar_t)y_low;
   } else {
     y_high = y_low + 1;
   }
 
   if (x_low >= width - 1) {
     x_high = x_low = width - 1;
-    x = (T)x_low;
+    x = (scalar_t)x_low;
   } else {
     x_high = x_low + 1;
   }
 
-  T ly = y - y_low;
-  T lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
+  scalar_t ly = y - y_low;
+  scalar_t lx = x - x_low;
+  scalar_t hy = 1. - ly, hx = 1. - lx;
   // do bilinear interpolation
-  T v1 = input[y_low * width + x_low];
-  T v2 = input[y_low * width + x_high];
-  T v3 = input[y_high * width + x_low];
-  T v4 = input[y_high * width + x_high];
-  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+  scalar_t v1 = input[y_low * width + x_low];
+  scalar_t v2 = input[y_low * width + x_high];
+  scalar_t v3 = input[y_high * width + x_low];
+  scalar_t v4 = input[y_high * width + x_high];
+  scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
 
-  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
 
   return val;
 }
 
-template <typename T>
-__device__ void bilinear_interpolate_gradient(
-    const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
-    int& x_low, int& x_high, int& y_low, int& y_high,
-    const int index /* index for debug only*/) {
-  // deal with cases that inverse elements are out of feature map boundary
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    // empty
-    w1 = w2 = w3 = w4 = 0.;
-    x_low = x_high = y_low = y_high = -1;
-    return;
-  }
-
-  if (y <= 0) y = 0;
-  if (x <= 0) x = 0;
-
-  y_low = (int)y;
-  x_low = (int)x;
-
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = (T)y_low;
-  } else {
-    y_high = y_low + 1;
-  }
-
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = (T)x_low;
-  } else {
-    x_high = x_low + 1;
-  }
-
-  T ly = y - y_low;
-  T lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-
-  // reference in forward
-  // T v1 = input[y_low * width + x_low];
-  // T v2 = input[y_low * width + x_high];
-  // T v3 = input[y_high * width + x_low];
-  // T v4 = input[y_high * width + x_high];
-  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-
-  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-  return;
-}
 #endif  // COMMON_CUDA_HELPER
diff --git a/backend_ops/tensorrt/common/trt_cuda_helper.cuh b/backend_ops/tensorrt/common/trt_cuda_helper.cuh
deleted file mode 100644
index a4635dcdd5..0000000000
--- a/backend_ops/tensorrt/common/trt_cuda_helper.cuh
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef TRT_CUDA_HELPER_HPP
-#define TRT_CUDA_HELPER_HPP
-
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-
-#define cudaCheckError()                                       \
-  {                                                            \
-    cudaError_t e = cudaGetLastError();                        \
-    if (e != cudaSuccess) {                                    \
-      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
-             cudaGetErrorString(e));                           \
-      exit(0);                                                 \
-    }                                                          \
-  }
-
-/**
- * Returns a view of the original tensor with its dimensions permuted.
- *
- * @param[out] dst pointer to the destination tensor
- * @param[in] src pointer to the source tensor
- * @param[in] src_size shape of the src tensor
- * @param[in] permute The desired ordering of dimensions
- * @param[in] src_dim dim of src tensor
- * @param[in] stream cuda stream handle
- */
-template <class scalar_t>
-void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size,
-                   int *permute, int src_dim, cudaStream_t stream = 0);
-
-#endif  // TRT_CUDA_HELPER_HPP
diff --git a/backend_ops/tensorrt/common/trt_plugin_base.hpp b/backend_ops/tensorrt/common/trt_plugin_base.hpp
new file mode 100644
index 0000000000..986ca2b41e
--- /dev/null
+++ b/backend_ops/tensorrt/common/trt_plugin_base.hpp
@@ -0,0 +1,55 @@
+#ifndef TRT_PLUGIN_BASE_HPP
+#define TRT_PLUGIN_BASE_HPP
+#include "NvInferPlugin.h"
+#include "trt_plugin_helper.hpp"
+
+namespace mmlab {
+class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt {
+ public:
+  TRTPluginBase(const std::string &name) : mLayerName(name) {}
+  // IPluginV2 Methods
+  const char *getPluginVersion() const override { return "1"; }
+  int initialize() override { return STATUS_SUCCESS; }
+  void terminate() override {}
+  void destroy() override { delete this; }
+  void setPluginNamespace(const char *pluginNamespace) override {
+    mNamespace = pluginNamespace;
+  }
+  const char *getPluginNamespace() const override { return mNamespace.c_str(); }
+
+ protected:
+  const std::string mLayerName;
+  std::string mNamespace;
+
+ protected:
+  // To prevent compiler warnings.
+  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
+  using nvinfer1::IPluginV2DynamicExt::configurePlugin;
+  using nvinfer1::IPluginV2DynamicExt::enqueue;
+  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
+  using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize;
+  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
+  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
+};
+
+class TRTPluginCreatorBase : public nvinfer1::IPluginCreator {
+ public:
+  const char *getPluginVersion() const override { return "1"; };
+
+  const nvinfer1::PluginFieldCollection *getFieldNames() override {
+    return &mFC;
+  }
+
+  void setPluginNamespace(const char *pluginNamespace) override {
+    mNamespace = pluginNamespace;
+  }
+
+  const char *getPluginNamespace() const override { return mNamespace.c_str(); }
+
+ protected:
+  nvinfer1::PluginFieldCollection mFC;
+  std::vector<nvinfer1::PluginField> mPluginAttributes;
+  std::string mNamespace;
+};
+}  // namespace mmlab
+#endif
diff --git a/backend_ops/tensorrt/common/trt_plugin_helper.hpp b/backend_ops/tensorrt/common/trt_plugin_helper.hpp
index e8b019293a..2c8e3919bc 100644
--- a/backend_ops/tensorrt/common/trt_plugin_helper.hpp
+++ b/backend_ops/tensorrt/common/trt_plugin_helper.hpp
@@ -1,5 +1,6 @@
 #ifndef TRT_PLUGIN_HELPER_HPP
 #define TRT_PLUGIN_HELPER_HPP
+#include <iostream>
 #include <stdexcept>
 
 #include "NvInferPlugin.h"
diff --git a/backend_ops/tensorrt/common/trt_serialize.hpp b/backend_ops/tensorrt/common/trt_serialize.hpp
index 1f0899fdfe..dfbf735a7d 100644
--- a/backend_ops/tensorrt/common/trt_serialize.hpp
+++ b/backend_ops/tensorrt/common/trt_serialize.hpp
@@ -5,12 +5,8 @@
 #define TRT_SERIALIZE_HPP
 #include <cassert>
 #include <cstring>
-#include <iostream>
 #include <type_traits>
 #include <vector>
-using std::cerr;
-using std::cout;
-using std::endl;
 
 template <typename T>
 inline void serialize_value(void** buffer, T const& value);
diff --git a/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu b/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
index 3aa7014ffc..b4f673e501 100644
--- a/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
+++ b/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
@@ -1,5 +1,4 @@
 #include "common_cuda_helper.hpp"
-#include "trt_cuda_helper.cuh"
 #include "trt_plugin_helper.hpp"
 
 using mmlab::TensorDesc;
diff --git a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
index 8aae0ed3f1..cbf3eb7122 100644
--- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
+++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
@@ -6,18 +6,19 @@
 #include <chrono>
 
 #include "trt_multi_level_roi_align_kernel.hpp"
+#include "trt_plugin_helper.hpp"
 #include "trt_serialize.hpp"
-
+namespace mmlab {
 namespace {
 static const char *PLUGIN_VERSION{"1"};
 static const char *PLUGIN_NAME{"MMCVMultiLevelRoiAlign"};
 }  // namespace
 
-MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic(
+TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(
     const std::string &name, int alignedHeight, int alignedWidth, int sampleNum,
     const std::vector<float> &featmapStrides, float roiScaleFactor,
     int finestScale, bool aligned)
-    : mLayerName(name),
+    : TRTPluginBase(name),
       mAlignedHeight(alignedHeight),
       mAlignedWidth(alignedWidth),
       mSampleNum(sampleNum),
@@ -26,9 +27,9 @@ MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic(
       mFinestScale(finestScale),
       mAligned(aligned) {}
 
-MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic(
-    const std::string name, const void *data, size_t length)
-    : mLayerName(name) {
+TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name,
+                                             const void *data, size_t length)
+    : TRTPluginBase(name) {
   deserialize_value(&data, &length, &mAlignedHeight);
   deserialize_value(&data, &length, &mAlignedWidth);
   deserialize_value(&data, &length, &mSampleNum);
@@ -38,8 +39,8 @@ MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic(
   deserialize_value(&data, &length, &mFeatmapStrides);
 }
 
-nvinfer1::IPluginV2DynamicExt *MultiLevelRoiAlignPluginDynamic::clone() const {
-  MultiLevelRoiAlignPluginDynamic *plugin = new MultiLevelRoiAlignPluginDynamic(
+nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRoiAlign::clone() const {
+  TRTMultiLevelRoiAlign *plugin = new TRTMultiLevelRoiAlign(
       mLayerName, mAlignedHeight, mAlignedWidth, mSampleNum, mFeatmapStrides,
       mRoiScaleFactor, mFinestScale, mAligned);
   plugin->setPluginNamespace(getPluginNamespace());
@@ -47,10 +48,10 @@ nvinfer1::IPluginV2DynamicExt *MultiLevelRoiAlignPluginDynamic::clone() const {
   return plugin;
 }
 
-nvinfer1::DimsExprs MultiLevelRoiAlignPluginDynamic::getOutputDimensions(
+nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions(
     int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
     nvinfer1::IExprBuilder &exprBuilder) {
-  assert(nbInputs == mFeatmapStrides.size() + 1);
+  ASSERT(nbInputs == mFeatmapStrides.size() + 1);
 
   nvinfer1::DimsExprs ret;
   ret.nbDims = 4;
@@ -62,33 +63,32 @@ nvinfer1::DimsExprs MultiLevelRoiAlignPluginDynamic::getOutputDimensions(
   return ret;
 }
 
-bool MultiLevelRoiAlignPluginDynamic::supportsFormatCombination(
+bool TRTMultiLevelRoiAlign::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
     int nbOutputs) {
-  const auto *in = inOut;
-  const auto *out = inOut + nbInputs;
   return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
          inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
 }
 
-void MultiLevelRoiAlignPluginDynamic::configurePlugin(
+void TRTMultiLevelRoiAlign::configurePlugin(
     const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
     const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {
   // Validate input arguments
-  assert(nbOutputs == 1);
-  assert(nbInputs == mFeatmapStrides.size() + 1);
+  ASSERT(nbOutputs == 1);
+  ASSERT(nbInputs == mFeatmapStrides.size() + 1);
 }
 
-size_t MultiLevelRoiAlignPluginDynamic::getWorkspaceSize(
+size_t TRTMultiLevelRoiAlign::getWorkspaceSize(
     const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
     const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
   return 0;
 }
 
-int MultiLevelRoiAlignPluginDynamic::enqueue(
-    const nvinfer1::PluginTensorDesc *inputDesc,
-    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
-    void *const *outputs, void *workSpace, cudaStream_t stream) {
+int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
+                                   const nvinfer1::PluginTensorDesc *outputDesc,
+                                   const void *const *inputs,
+                                   void *const *outputs, void *workSpace,
+                                   cudaStream_t stream) {
   int num_rois = inputDesc[0].dims.d[0];
   int batch_size = inputDesc[1].dims.d[0];
   int channels = inputDesc[1].dims.d[1];
@@ -117,34 +117,28 @@ int MultiLevelRoiAlignPluginDynamic::enqueue(
   return 0;
 }
 
-nvinfer1::DataType MultiLevelRoiAlignPluginDynamic::getOutputDataType(
+nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType(
     int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
   return nvinfer1::DataType::kFLOAT;
 }
 
 // IPluginV2 Methods
-const char *MultiLevelRoiAlignPluginDynamic::getPluginType() const {
-  return PLUGIN_NAME;
-}
+const char *TRTMultiLevelRoiAlign::getPluginType() const { return PLUGIN_NAME; }
 
-const char *MultiLevelRoiAlignPluginDynamic::getPluginVersion() const {
+const char *TRTMultiLevelRoiAlign::getPluginVersion() const {
   return PLUGIN_VERSION;
 }
 
-int MultiLevelRoiAlignPluginDynamic::getNbOutputs() const { return 1; }
-
-int MultiLevelRoiAlignPluginDynamic::initialize() { return 0; }
-
-void MultiLevelRoiAlignPluginDynamic::terminate() {}
+int TRTMultiLevelRoiAlign::getNbOutputs() const { return 1; }
 
-size_t MultiLevelRoiAlignPluginDynamic::getSerializationSize() const {
+size_t TRTMultiLevelRoiAlign::getSerializationSize() const {
   return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) +
          serialized_size(mAlignedWidth) + serialized_size(mSampleNum) +
          serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) +
          serialized_size(mAligned);
 }
 
-void MultiLevelRoiAlignPluginDynamic::serialize(void *buffer) const {
+void TRTMultiLevelRoiAlign::serialize(void *buffer) const {
   serialize_value(&buffer, mAlignedHeight);
   serialize_value(&buffer, mAlignedWidth);
   serialize_value(&buffer, mSampleNum);
@@ -154,22 +148,7 @@ void MultiLevelRoiAlignPluginDynamic::serialize(void *buffer) const {
   serialize_value(&buffer, mFeatmapStrides);
 }
 
-void MultiLevelRoiAlignPluginDynamic::destroy() {
-  // This gets called when the network containing plugin is destroyed
-  delete this;
-}
-
-void MultiLevelRoiAlignPluginDynamic::setPluginNamespace(
-    const char *libNamespace) {
-  mNamespace = libNamespace;
-}
-
-const char *MultiLevelRoiAlignPluginDynamic::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-MultiLevelRoiAlignPluginDynamicCreator::
-    MultiLevelRoiAlignPluginDynamicCreator() {
+TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() {
   mPluginAttributes = std::vector<nvinfer1::PluginField>(
       {nvinfer1::PluginField("output_height"),
        nvinfer1::PluginField("output_width"),
@@ -182,20 +161,15 @@ MultiLevelRoiAlignPluginDynamicCreator::
   mFC.fields = mPluginAttributes.data();
 }
 
-const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginName() const {
+const char *TRTMultiLevelRoiAlignCreator::getPluginName() const {
   return PLUGIN_NAME;
 }
 
-const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginVersion() const {
+const char *TRTMultiLevelRoiAlignCreator::getPluginVersion() const {
   return PLUGIN_VERSION;
 }
 
-const nvinfer1::PluginFieldCollection *
-MultiLevelRoiAlignPluginDynamicCreator::getFieldNames() {
-  return &mFC;
-}
-
-nvinfer1::IPluginV2 *MultiLevelRoiAlignPluginDynamicCreator::createPlugin(
+nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::createPlugin(
     const char *name, const nvinfer1::PluginFieldCollection *fc) {
   int alignedHeight = 7;
   int alignedWidth = 7;
@@ -231,30 +205,21 @@ nvinfer1::IPluginV2 *MultiLevelRoiAlignPluginDynamicCreator::createPlugin(
     }
   }
 
-  assert(featmapStrides.size() != 0);
+  ASSERT(featmapStrides.size() != 0);
 
-  MultiLevelRoiAlignPluginDynamic *plugin = new MultiLevelRoiAlignPluginDynamic(
+  TRTMultiLevelRoiAlign *plugin = new TRTMultiLevelRoiAlign(
       name, alignedHeight, alignedWidth, sampleNum, featmapStrides,
       roiScaleFactor, finestScale, aligned);
   plugin->setPluginNamespace(getPluginNamespace());
   return plugin;
 }
 
-nvinfer1::IPluginV2 *MultiLevelRoiAlignPluginDynamicCreator::deserializePlugin(
+nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::deserializePlugin(
     const char *name, const void *serialData, size_t serialLength) {
-  auto plugin =
-      new MultiLevelRoiAlignPluginDynamic(name, serialData, serialLength);
+  auto plugin = new TRTMultiLevelRoiAlign(name, serialData, serialLength);
   plugin->setPluginNamespace(getPluginNamespace());
   return plugin;
 }
 
-void MultiLevelRoiAlignPluginDynamicCreator::setPluginNamespace(
-    const char *libNamespace) {
-  mNamespace = libNamespace;
-}
-
-const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-REGISTER_TENSORRT_PLUGIN(MultiLevelRoiAlignPluginDynamicCreator);
+REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRoiAlignCreator);
+}  // namespace mmlab
diff --git a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
index e8ba617bf6..d55d8cd0fe 100644
--- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
+++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
@@ -7,22 +7,21 @@
 #include <string>
 #include <vector>
 
-#include "trt_plugin_helper.hpp"
+#include "trt_plugin_base.hpp"
 
-class MultiLevelRoiAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
+namespace mmlab {
+class TRTMultiLevelRoiAlign : public TRTPluginBase {
  public:
-  MultiLevelRoiAlignPluginDynamic(const std::string &name, int alignedHeight,
-                                  int alignedWidth, int sampleNum,
-                                  const std::vector<float> &featmapStrides,
-                                  float roiScaleFactor = -1,
-                                  int finestScale = 56, bool aligned = false);
+  TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight,
+                        int alignedWidth, int sampleNum,
+                        const std::vector<float> &featmapStrides,
+                        float roiScaleFactor = -1, int finestScale = 56,
+                        bool aligned = false);
 
-  MultiLevelRoiAlignPluginDynamic(const std::string name, const void *data,
-                                  size_t length);
+  TRTMultiLevelRoiAlign(const std::string name, const void *data,
+                        size_t length);
 
-  // It doesn't make sense to make MultiLevelRoiAlignPluginDynamic without
-  // arguments, so we delete default constructor.
-  MultiLevelRoiAlignPluginDynamic() = delete;
+  TRTMultiLevelRoiAlign() = delete;
 
   // IPluginV2DynamicExt Methods
   nvinfer1::IPluginV2DynamicExt *clone() const override;
@@ -54,18 +53,10 @@ class MultiLevelRoiAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
   const char *getPluginType() const override;
   const char *getPluginVersion() const override;
   int getNbOutputs() const override;
-  int initialize() override;
-  void terminate() override;
   size_t getSerializationSize() const override;
   void serialize(void *buffer) const override;
-  void destroy() override;
-  void setPluginNamespace(const char *pluginNamespace) override;
-  const char *getPluginNamespace() const override;
 
  private:
-  const std::string mLayerName;
-  std::string mNamespace;
-
   int mAlignedHeight;
   int mAlignedWidth;
   int mSampleNum;
@@ -73,43 +64,22 @@ class MultiLevelRoiAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
   float mRoiScaleFactor;
   int mFinestScale;
   bool mAligned;
-
- protected:
-  // To prevent compiler warnings.
-  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::configurePlugin;
-  using nvinfer1::IPluginV2DynamicExt::enqueue;
-  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
-  using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize;
-  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
 };
 
-class MultiLevelRoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator {
+class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase {
  public:
-  MultiLevelRoiAlignPluginDynamicCreator();
+  TRTMultiLevelRoiAlignCreator();
 
   const char *getPluginName() const override;
 
   const char *getPluginVersion() const override;
 
-  const nvinfer1::PluginFieldCollection *getFieldNames() override;
-
   nvinfer1::IPluginV2 *createPlugin(
       const char *name, const nvinfer1::PluginFieldCollection *fc) override;
 
   nvinfer1::IPluginV2 *deserializePlugin(const char *name,
                                          const void *serialData,
                                          size_t serialLength) override;
-
-  void setPluginNamespace(const char *pluginNamespace) override;
-
-  const char *getPluginNamespace() const override;
-
- private:
-  nvinfer1::PluginFieldCollection mFC;
-  std::vector<nvinfer1::PluginField> mPluginAttributes;
-  std::string mNamespace;
 };
-
+}  // namespace mmlab
 #endif  // TRT_ROI_ALIGN_HPP
diff --git a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu
index a58fcfdfda..f85d349e1d 100644
--- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu
+++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu
@@ -4,7 +4,6 @@
 #include <cmath>
 
 #include "common_cuda_helper.hpp"
-#include "trt_cuda_helper.cuh"
 #include "trt_multi_level_roi_align_kernel.hpp"
 #include "trt_plugin_helper.hpp"
 
@@ -19,53 +18,6 @@ struct FeatData {
   int num_featmap;
 };
 
-template <typename scalar_t>
-__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
-                                         const int height, const int width,
-                                         scalar_t y, scalar_t x) {
-  // deal with cases that inverse elements are out of feature map boundary
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    return 0;
-  }
-
-  if (y <= 0) y = 0;
-  if (x <= 0) x = 0;
-
-  int y_low = (int)y;
-  int x_low = (int)x;
-  int y_high;
-  int x_high;
-
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = (scalar_t)y_low;
-  } else {
-    y_high = y_low + 1;
-  }
-
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = (scalar_t)x_low;
-  } else {
-    x_high = x_low + 1;
-  }
-
-  scalar_t ly = y - y_low;
-  scalar_t lx = x - x_low;
-  scalar_t hy = 1. - ly;
-  scalar_t hx = 1. - lx;
-  // do bilinear interpolation
-  scalar_t lt = bottom_data[y_low * width + x_low];
-  scalar_t rt = bottom_data[y_low * width + x_high];
-  scalar_t lb = bottom_data[y_high * width + x_low];
-  scalar_t rb = bottom_data[y_high * width + x_high];
-  scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-  scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);
-
-  return val;
-}
-
 template <typename scalar_t>
 __device__ scalar_t roi_align_single(
     const scalar_t *bottom_data, const int roi_batch_ind,
diff --git a/backend_ops/tensorrt/nms/trt_nms.cpp b/backend_ops/tensorrt/nms/trt_nms.cpp
index d82be1bd3c..3335b448d7 100644
--- a/backend_ops/tensorrt/nms/trt_nms.cpp
+++ b/backend_ops/tensorrt/nms/trt_nms.cpp
@@ -5,39 +5,26 @@
 
 #include <chrono>
 
+#include "trt_nms_kernel.hpp"
 #include "trt_serialize.hpp"
-
-extern size_t get_onnxnms_workspace_size(
-    size_t num_batches, size_t spatial_dimension, size_t num_classes,
-    size_t boxes_word_size, int center_point_box, size_t output_length);
-
-extern void TRTNMSCUDAKernelLauncher_float(
-    const float *boxes, const float *scores,
-    const int max_output_boxes_per_class, const float iou_threshold,
-    const float score_threshold, const int offset, int *output,
-    int center_point_box, int num_batches, int spatial_dimension,
-    int num_classes, size_t output_length, void *workspace,
-    cudaStream_t stream);
-
+namespace mmlab {
 namespace {
 static const char *PLUGIN_VERSION{"1"};
 static const char *PLUGIN_NAME{"NonMaxSuppression"};
 }  // namespace
 
-NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(
-    const std::string &name, int centerPointBox, int maxOutputBoxesPerClass,
-    float iouThreshold, float scoreThreshold, int offset)
-    : mLayerName(name),
+TRTNMS::TRTNMS(const std::string &name, int centerPointBox,
+               int maxOutputBoxesPerClass, float iouThreshold,
+               float scoreThreshold, int offset)
+    : TRTPluginBase(name),
       mCenterPointBox(centerPointBox),
       mMaxOutputBoxesPerClass(maxOutputBoxesPerClass),
       mIouThreshold(iouThreshold),
       mScoreThreshold(scoreThreshold),
       mOffset(offset) {}
 
-NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(const std::string name,
-                                                   const void *data,
-                                                   size_t length)
-    : mLayerName(name) {
+TRTNMS::TRTNMS(const std::string name, const void *data, size_t length)
+    : TRTPluginBase(name) {
   deserialize_value(&data, &length, &mCenterPointBox);
   deserialize_value(&data, &length, &mMaxOutputBoxesPerClass);
   deserialize_value(&data, &length, &mIouThreshold);
@@ -45,16 +32,16 @@ NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(const std::string name,
   deserialize_value(&data, &length, &mOffset);
 }
 
-nvinfer1::IPluginV2DynamicExt *NonMaxSuppressionDynamic::clone() const {
-  NonMaxSuppressionDynamic *plugin = new NonMaxSuppressionDynamic(
-      mLayerName, mCenterPointBox, mMaxOutputBoxesPerClass, mIouThreshold,
-      mScoreThreshold, mOffset);
+nvinfer1::IPluginV2DynamicExt *TRTNMS::clone() const {
+  TRTNMS *plugin =
+      new TRTNMS(mLayerName, mCenterPointBox, mMaxOutputBoxesPerClass,
+                 mIouThreshold, mScoreThreshold, mOffset);
   plugin->setPluginNamespace(getPluginNamespace());
 
   return plugin;
 }
 
-nvinfer1::DimsExprs NonMaxSuppressionDynamic::getOutputDimensions(
+nvinfer1::DimsExprs TRTNMS::getOutputDimensions(
     int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
     nvinfer1::IExprBuilder &exprBuilder) {
   nvinfer1::DimsExprs ret;
@@ -76,9 +63,9 @@ nvinfer1::DimsExprs NonMaxSuppressionDynamic::getOutputDimensions(
   return ret;
 }
 
-bool NonMaxSuppressionDynamic::supportsFormatCombination(
-    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
-    int nbOutputs) {
+bool TRTNMS::supportsFormatCombination(int pos,
+                                       const nvinfer1::PluginTensorDesc *inOut,
+                                       int nbInputs, int nbOutputs) {
   if (pos < nbInputs) {
     switch (pos) {
       case 0:
@@ -105,13 +92,15 @@ bool NonMaxSuppressionDynamic::supportsFormatCombination(
   return true;
 }
 
-void NonMaxSuppressionDynamic::configurePlugin(
-    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
-    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
+void TRTNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
+                             int nbInputs,
+                             const nvinfer1::DynamicPluginTensorDesc *outputs,
+                             int nbOutputs) {}
 
-size_t NonMaxSuppressionDynamic::getWorkspaceSize(
-    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
-    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
+size_t TRTNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
+                                int nbInputs,
+                                const nvinfer1::PluginTensorDesc *outputs,
+                                int nbOutputs) const {
   size_t boxes_word_size = mmlab::getElementSize(inputs[0].type);
   size_t num_batches = inputs[0].dims.d[0];
   size_t spatial_dimension = inputs[0].dims.d[1];
@@ -123,10 +112,10 @@ size_t NonMaxSuppressionDynamic::getWorkspaceSize(
                                     output_length);
 }
 
-int NonMaxSuppressionDynamic::enqueue(
-    const nvinfer1::PluginTensorDesc *inputDesc,
-    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
-    void *const *outputs, void *workSpace, cudaStream_t stream) {
+int TRTNMS::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
+                    const nvinfer1::PluginTensorDesc *outputDesc,
+                    const void *const *inputs, void *const *outputs,
+                    void *workSpace, cudaStream_t stream) {
   int num_batches = inputDesc[0].dims.d[0];
   int spatial_dimension = inputDesc[0].dims.d[1];
   int num_classes = inputDesc[1].dims.d[1];
@@ -135,42 +124,34 @@ int NonMaxSuppressionDynamic::enqueue(
   const float *boxes = (const float *)inputs[0];
   const float *scores = (const float *)inputs[1];
   int *output = (int *)outputs[0];
-  TRTNMSCUDAKernelLauncher_float(
-      boxes, scores, mMaxOutputBoxesPerClass, mIouThreshold, mScoreThreshold,
-      mOffset, output, mCenterPointBox, num_batches, spatial_dimension,
-      num_classes, output_length, workSpace, stream);
+  NMSCUDAKernelLauncher_float(boxes, scores, mMaxOutputBoxesPerClass,
+                              mIouThreshold, mScoreThreshold, mOffset, output,
+                              mCenterPointBox, num_batches, spatial_dimension,
+                              num_classes, output_length, workSpace, stream);
 
   return 0;
 }
 
-nvinfer1::DataType NonMaxSuppressionDynamic::getOutputDataType(
+nvinfer1::DataType TRTNMS::getOutputDataType(
     int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
   return nvinfer1::DataType::kINT32;
 }
 
 // IPluginV2 Methods
-const char *NonMaxSuppressionDynamic::getPluginType() const {
-  return PLUGIN_NAME;
-}
-
-const char *NonMaxSuppressionDynamic::getPluginVersion() const {
-  return PLUGIN_VERSION;
-}
+const char *TRTNMS::getPluginType() const { return PLUGIN_NAME; }
 
-int NonMaxSuppressionDynamic::getNbOutputs() const { return 1; }
+const char *TRTNMS::getPluginVersion() const { return PLUGIN_VERSION; }
 
-int NonMaxSuppressionDynamic::initialize() { return 0; }
+int TRTNMS::getNbOutputs() const { return 1; }
 
-void NonMaxSuppressionDynamic::terminate() {}
-
-size_t NonMaxSuppressionDynamic::getSerializationSize() const {
+size_t TRTNMS::getSerializationSize() const {
   return serialized_size(mCenterPointBox) +
          serialized_size(mMaxOutputBoxesPerClass) +
          serialized_size(mIouThreshold) + serialized_size(mScoreThreshold) +
          serialized_size(mOffset);
 }
 
-void NonMaxSuppressionDynamic::serialize(void *buffer) const {
+void TRTNMS::serialize(void *buffer) const {
   serialize_value(&buffer, mCenterPointBox);
   serialize_value(&buffer, mMaxOutputBoxesPerClass);
   serialize_value(&buffer, mIouThreshold);
@@ -178,22 +159,7 @@ void NonMaxSuppressionDynamic::serialize(void *buffer) const {
   serialize_value(&buffer, mOffset);
 }
 
-void NonMaxSuppressionDynamic::destroy() {
-  // This gets called when the network containing plugin is destroyed
-  delete this;
-}
-
-void NonMaxSuppressionDynamic::setPluginNamespace(const char *libNamespace) {
-  mNamespace = libNamespace;
-}
-
-const char *NonMaxSuppressionDynamic::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-////////////////////// creator /////////////////////////////
-
-NonMaxSuppressionDynamicCreator::NonMaxSuppressionDynamicCreator() {
+TRTNMSCreator::TRTNMSCreator() {
   mPluginAttributes.clear();
   mPluginAttributes.emplace_back(nvinfer1::PluginField("center_point_box"));
   mPluginAttributes.emplace_back(
@@ -205,20 +171,11 @@ NonMaxSuppressionDynamicCreator::NonMaxSuppressionDynamicCreator() {
   mFC.fields = mPluginAttributes.data();
 }
 
-const char *NonMaxSuppressionDynamicCreator::getPluginName() const {
-  return PLUGIN_NAME;
-}
-
-const char *NonMaxSuppressionDynamicCreator::getPluginVersion() const {
-  return PLUGIN_VERSION;
-}
+const char *TRTNMSCreator::getPluginName() const { return PLUGIN_NAME; }
 
-const nvinfer1::PluginFieldCollection *
-NonMaxSuppressionDynamicCreator::getFieldNames() {
-  return &mFC;
-}
+const char *TRTNMSCreator::getPluginVersion() const { return PLUGIN_VERSION; }
 
-nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::createPlugin(
+nvinfer1::IPluginV2 *TRTNMSCreator::createPlugin(
     const char *name, const nvinfer1::PluginFieldCollection *fc) {
   int centerPointBox = 0;
   int maxOutputBoxesPerClass = 0;
@@ -252,27 +209,18 @@ nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::createPlugin(
       offset = static_cast<const int *>(fc->fields[i].data)[0];
     }
   }
-  NonMaxSuppressionDynamic *plugin =
-      new NonMaxSuppressionDynamic(name, centerPointBox, maxOutputBoxesPerClass,
-                                   iouThreshold, scoreThreshold, offset);
+  TRTNMS *plugin = new TRTNMS(name, centerPointBox, maxOutputBoxesPerClass,
+                              iouThreshold, scoreThreshold, offset);
   plugin->setPluginNamespace(getPluginNamespace());
   return plugin;
 }
 
-nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::deserializePlugin(
-    const char *name, const void *serialData, size_t serialLength) {
-  auto plugin = new NonMaxSuppressionDynamic(name, serialData, serialLength);
+nvinfer1::IPluginV2 *TRTNMSCreator::deserializePlugin(const char *name,
+                                                      const void *serialData,
+                                                      size_t serialLength) {
+  auto plugin = new TRTNMS(name, serialData, serialLength);
   plugin->setPluginNamespace(getPluginNamespace());
   return plugin;
 }
-
-void NonMaxSuppressionDynamicCreator::setPluginNamespace(
-    const char *libNamespace) {
-  mNamespace = libNamespace;
-}
-
-const char *NonMaxSuppressionDynamicCreator::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-REGISTER_TENSORRT_PLUGIN(NonMaxSuppressionDynamicCreator);
+REGISTER_TENSORRT_PLUGIN(TRTNMSCreator);
+}  // namespace mmlab
diff --git a/backend_ops/tensorrt/nms/trt_nms.hpp b/backend_ops/tensorrt/nms/trt_nms.hpp
index a696ea73c5..1970113365 100644
--- a/backend_ops/tensorrt/nms/trt_nms.hpp
+++ b/backend_ops/tensorrt/nms/trt_nms.hpp
@@ -6,18 +6,17 @@
 #include <string>
 #include <vector>
 
-#include "trt_plugin_helper.hpp"
-
-class NonMaxSuppressionDynamic : public nvinfer1::IPluginV2DynamicExt {
+#include "trt_plugin_base.hpp"
+namespace mmlab {
+class TRTNMS : public TRTPluginBase {
  public:
-  NonMaxSuppressionDynamic(const std::string &name, int centerPointBox,
-                           int maxOutputBoxesPerClass, float iouThreshold,
-                           float scoreThreshold, int offset);
+  TRTNMS(const std::string &name, int centerPointBox,
+         int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold,
+         int offset);
 
-  NonMaxSuppressionDynamic(const std::string name, const void *data,
-                           size_t length);
+  TRTNMS(const std::string name, const void *data, size_t length);
 
-  NonMaxSuppressionDynamic() = delete;
+  TRTNMS() = delete;
 
   // IPluginV2DynamicExt Methods
   nvinfer1::IPluginV2DynamicExt *clone() const override;
@@ -49,59 +48,31 @@ class NonMaxSuppressionDynamic : public nvinfer1::IPluginV2DynamicExt {
   const char *getPluginType() const override;
   const char *getPluginVersion() const override;
   int getNbOutputs() const override;
-  int initialize() override;
-  void terminate() override;
   size_t getSerializationSize() const override;
   void serialize(void *buffer) const override;
-  void destroy() override;
-  void setPluginNamespace(const char *pluginNamespace) override;
-  const char *getPluginNamespace() const override;
 
  private:
-  const std::string mLayerName;
-  std::string mNamespace;
-
   int mCenterPointBox;
   int mMaxOutputBoxesPerClass;
   float mIouThreshold;
   float mScoreThreshold;
   int mOffset;
-
- protected:
-  // To prevent compiler warnings.
-  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::configurePlugin;
-  using nvinfer1::IPluginV2DynamicExt::enqueue;
-  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
-  using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize;
-  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
 };
 
-class NonMaxSuppressionDynamicCreator : public nvinfer1::IPluginCreator {
+class TRTNMSCreator : public TRTPluginCreatorBase {
  public:
-  NonMaxSuppressionDynamicCreator();
+  TRTNMSCreator();
 
   const char *getPluginName() const override;
 
   const char *getPluginVersion() const override;
 
-  const nvinfer1::PluginFieldCollection *getFieldNames() override;
-
   nvinfer1::IPluginV2 *createPlugin(
       const char *name, const nvinfer1::PluginFieldCollection *fc) override;
 
   nvinfer1::IPluginV2 *deserializePlugin(const char *name,
                                          const void *serialData,
                                          size_t serialLength) override;
-
-  void setPluginNamespace(const char *pluginNamespace) override;
-
-  const char *getPluginNamespace() const override;
-
- private:
-  nvinfer1::PluginFieldCollection mFC;
-  std::vector<nvinfer1::PluginField> mPluginAttributes;
-  std::string mNamespace;
 };
+}  // namespace mmlab
 #endif  // TRT_NMS_HPP
diff --git a/backend_ops/tensorrt/nms/trt_nms_kernel.cu b/backend_ops/tensorrt/nms/trt_nms_kernel.cu
index bd12f3ac55..324db93a70 100644
--- a/backend_ops/tensorrt/nms/trt_nms_kernel.cu
+++ b/backend_ops/tensorrt/nms/trt_nms_kernel.cu
@@ -1,3 +1,4 @@
+#include <float.h>
 #include <stdio.h>
 #include <thrust/execution_policy.h>
 #include <thrust/gather.h>
@@ -9,8 +10,7 @@
 #include <vector>
 
 #include "common_cuda_helper.hpp"
-#include "trt_cuda_helper.cuh"
-#include "trt_nms_kernel.cuh"
+#include "trt_nms_kernel.hpp"
 #include "trt_plugin_helper.hpp"
 
 struct NMSBox {
@@ -50,6 +50,65 @@ struct nms_score_threshold {
   }
 };
 
+static int const threadsPerBlock = sizeof(unsigned long long int) * 8;
+
+__device__ inline bool devIoU(float const* const a, float const* const b,
+                              const int offset, const float threshold) {
+  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
+  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
+  float width = fmaxf(right - left + offset, 0.f),
+        height = fmaxf(bottom - top + offset, 0.f);
+  float interS = width * height;
+  float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
+  float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
+  return interS > threshold * (Sa + Sb - interS);
+}
+
+__global__ void nms_cuda(const int n_boxes, const float iou_threshold,
+                         const int offset, const float* dev_boxes,
+                         unsigned long long* dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+  const int tid = threadIdx.x;
+
+  if (row_start > col_start) return;
+
+  const int row_size =
+      fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+      fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  __shared__ float block_boxes[threadsPerBlock * 4];
+  if (tid < col_size) {
+    block_boxes[tid * 4 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
+    block_boxes[tid * 4 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
+    block_boxes[tid * 4 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
+    block_boxes[tid * 4 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
+  }
+  __syncthreads();
+
+  if (tid < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + tid;
+    const float* cur_box = dev_boxes + cur_box_idx * 4;
+    int i = 0;
+    unsigned long long int t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = tid + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
+        t |= 1ULL << i;
+      }
+    }
+    dev_mask[cur_box_idx * gridDim.y + col_start] = t;
+  }
+}
+
 __global__ void nms_reindex_kernel(int n, int* output, int* index_cache) {
   CUDA_1D_KERNEL_LOOP(index, n) {
     const int old_index = output[index * 3 + 2];
@@ -152,15 +211,14 @@ size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
  * @param[in] workspace memory for all temporary variables.
  * @param[in] stream cuda stream
  */
-void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
-                                    const int max_output_boxes_per_class,
-                                    const float iou_threshold,
-                                    const float score_threshold,
-                                    const int offset, int* output,
-                                    int center_point_box, int num_batches,
-                                    int spatial_dimension, int num_classes,
-                                    size_t output_length, void* workspace,
-                                    cudaStream_t stream) {
+void NMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
+                                 const int max_output_boxes_per_class,
+                                 const float iou_threshold,
+                                 const float score_threshold, const int offset,
+                                 int* output, int center_point_box,
+                                 int num_batches, int spatial_dimension,
+                                 int num_classes, size_t output_length,
+                                 void* workspace, cudaStream_t stream) {
   using mmlab::getAlignedSize;
   const int col_blocks = DIVUP(spatial_dimension, THREADS_PER_BLOCK);
   float* boxes_sorted = (float*)workspace;
diff --git a/backend_ops/tensorrt/nms/trt_nms_kernel.cuh b/backend_ops/tensorrt/nms/trt_nms_kernel.cuh
deleted file mode 100644
index 9238aefefe..0000000000
--- a/backend_ops/tensorrt/nms/trt_nms_kernel.cuh
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef NMS_CUDA_KERNEL_CUH
-#define NMS_CUDA_KERNEL_CUH
-
-#include <float.h>
-
-#include "common_cuda_helper.hpp"
-
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long int) * 8;
-
-__device__ inline bool devIoU(float const *const a, float const *const b,
-                              const int offset, const float threshold) {
-  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
-  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
-  float width = fmaxf(right - left + offset, 0.f),
-        height = fmaxf(bottom - top + offset, 0.f);
-  float interS = width * height;
-  float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
-  float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
-  return interS > threshold * (Sa + Sb - interS);
-}
-
-__global__ void nms_cuda(const int n_boxes, const float iou_threshold,
-                         const int offset, const float *dev_boxes,
-                         unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-  const int tid = threadIdx.x;
-
-  if (row_start > col_start) return;
-
-  const int row_size =
-      fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
-  const int col_size =
-      fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-
-  __shared__ float block_boxes[threadsPerBlock * 4];
-  if (tid < col_size) {
-    block_boxes[tid * 4 + 0] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
-    block_boxes[tid * 4 + 1] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
-    block_boxes[tid * 4 + 2] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
-    block_boxes[tid * 4 + 3] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
-  }
-  __syncthreads();
-
-  if (tid < row_size) {
-    const int cur_box_idx = threadsPerBlock * row_start + tid;
-    const float *cur_box = dev_boxes + cur_box_idx * 4;
-    int i = 0;
-    unsigned long long int t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = tid + 1;
-    }
-    for (i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
-        t |= 1ULL << i;
-      }
-    }
-    dev_mask[cur_box_idx * gridDim.y + col_start] = t;
-  }
-}
-#endif  // NMS_CUDA_KERNEL_CUH
diff --git a/backend_ops/tensorrt/nms/trt_nms_kernel.hpp b/backend_ops/tensorrt/nms/trt_nms_kernel.hpp
new file mode 100644
index 0000000000..91a522cb53
--- /dev/null
+++ b/backend_ops/tensorrt/nms/trt_nms_kernel.hpp
@@ -0,0 +1,18 @@
+#ifndef NMS_CUDA_KERNEL_HPP
+#define NMS_CUDA_KERNEL_HPP
+
+#include "common_cuda_helper.hpp"
+
+size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
+                                  size_t num_classes, size_t boxes_word_size,
+                                  int center_point_box, size_t output_length);
+
+void NMSCUDAKernelLauncher_float(const float *boxes, const float *scores,
+                                 const int max_output_boxes_per_class,
+                                 const float iou_threshold,
+                                 const float score_threshold, const int offset,
+                                 int *output, int center_point_box,
+                                 int num_batches, int spatial_dimension,
+                                 int num_classes, size_t output_length,
+                                 void *workspace, cudaStream_t stream);
+#endif  // NMS_CUDA_KERNEL_HPP
diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align.cpp b/backend_ops/tensorrt/roi_align/trt_roi_align.cpp
index 0c14c96745..f646306beb 100644
--- a/backend_ops/tensorrt/roi_align/trt_roi_align.cpp
+++ b/backend_ops/tensorrt/roi_align/trt_roi_align.cpp
@@ -1,28 +1,23 @@
 #include "trt_roi_align.hpp"
 
-#include <assert.h>
-
 #include <chrono>
+#include <iostream>
 
+#include "common_cuda_helper.hpp"
+#include "trt_plugin_helper.hpp"
+#include "trt_roi_align_kernel.hpp"
 #include "trt_serialize.hpp"
 
-extern void TRTRoIAlignForwardCUDAKernelLauncher_float(
-    const float *input, const float *rois, float *output, float *argmax_y,
-    float *argmax_x, int output_size, int channels, int height, int width,
-    int aligned_height, int aligned_width, float spatial_scale,
-    int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream);
-
+namespace mmlab {
 namespace {
 static const char *PLUGIN_VERSION{"1"};
 static const char *PLUGIN_NAME{"MMCVRoiAlign"};
 }  // namespace
 
-RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string &name,
-                                             int outWidth, int outHeight,
-                                             float spatialScale,
-                                             int sampleRatio, int poolMode,
-                                             bool aligned)
-    : mLayerName(name),
+TRTRoIAlign::TRTRoIAlign(const std::string &name, int outWidth, int outHeight,
+                         float spatialScale, int sampleRatio, int poolMode,
+                         bool aligned)
+    : TRTPluginBase(name),
       mOutWidth(outWidth),
       mOutHeight(outHeight),
       mSpatialScale(spatialScale),
@@ -30,9 +25,9 @@ RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string &name,
       mPoolMode(poolMode),
       mAligned(aligned) {}
 
-RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string name,
-                                             const void *data, size_t length)
-    : mLayerName(name) {
+TRTRoIAlign::TRTRoIAlign(const std::string name, const void *data,
+                         size_t length)
+    : TRTPluginBase(name) {
   deserialize_value(&data, &length, &mOutWidth);
   deserialize_value(&data, &length, &mOutHeight);
   deserialize_value(&data, &length, &mSpatialScale);
@@ -41,16 +36,16 @@ RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string name,
   deserialize_value(&data, &length, &mAligned);
 }
 
-nvinfer1::IPluginV2DynamicExt *RoIAlignPluginDynamic::clone() const {
-  RoIAlignPluginDynamic *plugin = new RoIAlignPluginDynamic(
-      mLayerName, mOutWidth, mOutHeight, mSpatialScale, mSampleRatio, mPoolMode,
-      mAligned);
+nvinfer1::IPluginV2DynamicExt *TRTRoIAlign::clone() const {
+  TRTRoIAlign *plugin =
+      new TRTRoIAlign(mLayerName, mOutWidth, mOutHeight, mSpatialScale,
+                      mSampleRatio, mPoolMode, mAligned);
   plugin->setPluginNamespace(getPluginNamespace());
 
   return plugin;
 }
 
-nvinfer1::DimsExprs RoIAlignPluginDynamic::getOutputDimensions(
+nvinfer1::DimsExprs TRTRoIAlign::getOutputDimensions(
     int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
     nvinfer1::IExprBuilder &exprBuilder) {
   nvinfer1::DimsExprs ret;
@@ -63,20 +58,21 @@ nvinfer1::DimsExprs RoIAlignPluginDynamic::getOutputDimensions(
   return ret;
 }
 
-bool RoIAlignPluginDynamic::supportsFormatCombination(
+bool TRTRoIAlign::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
     int nbOutputs) {
   return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
          inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
 }
 
-void RoIAlignPluginDynamic::configurePlugin(
+void TRTRoIAlign::configurePlugin(
     const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
     const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
 
-size_t RoIAlignPluginDynamic::getWorkspaceSize(
-    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
-    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
+size_t TRTRoIAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
+                                     int nbInputs,
+                                     const nvinfer1::PluginTensorDesc *outputs,
+                                     int nbOutputs) const {
   size_t output_size = 0;
   size_t word_size = 0;
   switch (mPoolMode) {
@@ -95,11 +91,10 @@ size_t RoIAlignPluginDynamic::getWorkspaceSize(
   return 0;
 }
 
-int RoIAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
-                                   const nvinfer1::PluginTensorDesc *outputDesc,
-                                   const void *const *inputs,
-                                   void *const *outputs, void *workSpace,
-                                   cudaStream_t stream) {
+int TRTRoIAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
+                         const nvinfer1::PluginTensorDesc *outputDesc,
+                         const void *const *inputs, void *const *outputs,
+                         void *workSpace, cudaStream_t stream) {
   int channels = inputDesc[0].dims.d[1];
   int height = inputDesc[0].dims.d[2];
   int width = inputDesc[0].dims.d[3];
@@ -125,7 +120,7 @@ int RoIAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
 
   switch (outputDesc[0].type) {
     case nvinfer1::DataType::kFLOAT:
-      TRTRoIAlignForwardCUDAKernelLauncher_float(
+      TRTRoIAlignForwardCUDAKernelLauncher<float>(
           (const float *)feat, (const float *)rois, (float *)output,
           (float *)argmax_y, (float *)argmax_x, output_size, channels, height,
           width, mOutHeight, mOutWidth, mSpatialScale, mSampleRatio, mPoolMode,
@@ -139,31 +134,25 @@ int RoIAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
   return 0;
 }
 
-nvinfer1::DataType RoIAlignPluginDynamic::getOutputDataType(
+nvinfer1::DataType TRTRoIAlign::getOutputDataType(
     int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
   return inputTypes[0];
 }
 
 // IPluginV2 Methods
-const char *RoIAlignPluginDynamic::getPluginType() const { return PLUGIN_NAME; }
-
-const char *RoIAlignPluginDynamic::getPluginVersion() const {
-  return PLUGIN_VERSION;
-}
-
-int RoIAlignPluginDynamic::getNbOutputs() const { return 1; }
+const char *TRTRoIAlign::getPluginType() const { return PLUGIN_NAME; }
 
-int RoIAlignPluginDynamic::initialize() { return 0; }
+const char *TRTRoIAlign::getPluginVersion() const { return PLUGIN_VERSION; }
 
-void RoIAlignPluginDynamic::terminate() {}
+int TRTRoIAlign::getNbOutputs() const { return 1; }
 
-size_t RoIAlignPluginDynamic::getSerializationSize() const {
+size_t TRTRoIAlign::getSerializationSize() const {
   return serialized_size(mOutWidth) + serialized_size(mOutHeight) +
          serialized_size(mSpatialScale) + serialized_size(mSampleRatio) +
          serialized_size(mPoolMode) + serialized_size(mAligned);
 }
 
-void RoIAlignPluginDynamic::serialize(void *buffer) const {
+void TRTRoIAlign::serialize(void *buffer) const {
   serialize_value(&buffer, mOutWidth);
   serialize_value(&buffer, mOutHeight);
   serialize_value(&buffer, mSpatialScale);
@@ -172,22 +161,7 @@ void RoIAlignPluginDynamic::serialize(void *buffer) const {
   serialize_value(&buffer, mAligned);
 }
 
-void RoIAlignPluginDynamic::destroy() {
-  // This gets called when the network containing plugin is destroyed
-  delete this;
-}
-
-void RoIAlignPluginDynamic::setPluginNamespace(const char *libNamespace) {
-  mNamespace = libNamespace;
-}
-
-const char *RoIAlignPluginDynamic::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-////////////////////// creator /////////////////////////////
-
-RoIAlignPluginDynamicCreator::RoIAlignPluginDynamicCreator() {
+TRTRoIAlignCreator::TRTRoIAlignCreator() {
   mPluginAttributes.emplace_back(nvinfer1::PluginField("output_height"));
   mPluginAttributes.emplace_back(nvinfer1::PluginField("output_width"));
   mPluginAttributes.emplace_back(nvinfer1::PluginField("spatial_scale"));
@@ -198,20 +172,13 @@ RoIAlignPluginDynamicCreator::RoIAlignPluginDynamicCreator() {
   mFC.fields = mPluginAttributes.data();
 }
 
-const char *RoIAlignPluginDynamicCreator::getPluginName() const {
-  return PLUGIN_NAME;
-}
+const char *TRTRoIAlignCreator::getPluginName() const { return PLUGIN_NAME; }
 
-const char *RoIAlignPluginDynamicCreator::getPluginVersion() const {
+const char *TRTRoIAlignCreator::getPluginVersion() const {
   return PLUGIN_VERSION;
 }
 
-const nvinfer1::PluginFieldCollection *
-RoIAlignPluginDynamicCreator::getFieldNames() {
-  return &mFC;
-}
-
-nvinfer1::IPluginV2 *RoIAlignPluginDynamicCreator::createPlugin(
+nvinfer1::IPluginV2 *TRTRoIAlignCreator::createPlugin(
     const char *name, const nvinfer1::PluginFieldCollection *fc) {
   int outWidth = 7;
   int outHeight = 7;
@@ -253,7 +220,7 @@ nvinfer1::IPluginV2 *RoIAlignPluginDynamicCreator::createPlugin(
         std::cout << "Unknown pool mode \"" << poolModeStr << "\"."
                   << std::endl;
       }
-      assert(poolMode >= 0);
+      ASSERT(poolMode >= 0);
     }
 
     if (field_name.compare("aligned") == 0) {
@@ -262,31 +229,22 @@ nvinfer1::IPluginV2 *RoIAlignPluginDynamicCreator::createPlugin(
     }
   }
 
-  assert(outHeight > 0);
-  assert(outWidth > 0);
-  assert(spatialScale > 0.);
-  assert(poolMode >= 0);
+  ASSERT(outHeight > 0);
+  ASSERT(outWidth > 0);
+  ASSERT(spatialScale > 0.);
+  ASSERT(poolMode >= 0);
 
-  RoIAlignPluginDynamic *plugin = new RoIAlignPluginDynamic(
-      name, outWidth, outHeight, spatialScale, sampleRatio, poolMode, aligned);
+  TRTRoIAlign *plugin = new TRTRoIAlign(name, outWidth, outHeight, spatialScale,
+                                        sampleRatio, poolMode, aligned);
   plugin->setPluginNamespace(getPluginNamespace());
   return plugin;
 }
 
-nvinfer1::IPluginV2 *RoIAlignPluginDynamicCreator::deserializePlugin(
+nvinfer1::IPluginV2 *TRTRoIAlignCreator::deserializePlugin(
     const char *name, const void *serialData, size_t serialLength) {
-  auto plugin = new RoIAlignPluginDynamic(name, serialData, serialLength);
+  auto plugin = new TRTRoIAlign(name, serialData, serialLength);
   plugin->setPluginNamespace(getPluginNamespace());
   return plugin;
 }
-
-void RoIAlignPluginDynamicCreator::setPluginNamespace(
-    const char *libNamespace) {
-  mNamespace = libNamespace;
-}
-
-const char *RoIAlignPluginDynamicCreator::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator);
+REGISTER_TENSORRT_PLUGIN(TRTRoIAlignCreator);
+}  // namespace mmlab
diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align.hpp b/backend_ops/tensorrt/roi_align/trt_roi_align.hpp
index 00b536c18b..bc35d5d7b0 100644
--- a/backend_ops/tensorrt/roi_align/trt_roi_align.hpp
+++ b/backend_ops/tensorrt/roi_align/trt_roi_align.hpp
@@ -6,18 +6,16 @@
 #include <string>
 #include <vector>
 
-#include "trt_plugin_helper.hpp"
-
-class RoIAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
+#include "trt_plugin_base.hpp"
+namespace mmlab {
+class TRTRoIAlign : public TRTPluginBase {
  public:
-  RoIAlignPluginDynamic(const std::string &name, int outWidth, int outHeight,
-                        float spatialScale, int sampleRatio, int poolMode,
-                        bool aligned);
+  TRTRoIAlign(const std::string &name, int outWidth, int outHeight,
+              float spatialScale, int sampleRatio, int poolMode, bool aligned);
 
-  RoIAlignPluginDynamic(const std::string name, const void *data,
-                        size_t length);
+  TRTRoIAlign(const std::string name, const void *data, size_t length);
 
-  RoIAlignPluginDynamic() = delete;
+  TRTRoIAlign() = delete;
 
   // IPluginV2DynamicExt Methods
   nvinfer1::IPluginV2DynamicExt *clone() const override;
@@ -49,60 +47,31 @@ class RoIAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
   const char *getPluginType() const override;
   const char *getPluginVersion() const override;
   int getNbOutputs() const override;
-  int initialize() override;
-  void terminate() override;
   size_t getSerializationSize() const override;
   void serialize(void *buffer) const override;
-  void destroy() override;
-  void setPluginNamespace(const char *pluginNamespace) override;
-  const char *getPluginNamespace() const override;
 
  private:
-  const std::string mLayerName;
-  std::string mNamespace;
-
   int mOutWidth;
   int mOutHeight;
   float mSpatialScale;
   int mSampleRatio;
   int mPoolMode;  // 1:avg 0:max
   bool mAligned;
-
- protected:
-  // To prevent compiler warnings.
-  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::configurePlugin;
-  using nvinfer1::IPluginV2DynamicExt::enqueue;
-  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
-  using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize;
-  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
 };
 
-class RoIAlignPluginDynamicCreator : public nvinfer1::IPluginCreator {
+class TRTRoIAlignCreator : public TRTPluginCreatorBase {
  public:
-  RoIAlignPluginDynamicCreator();
+  TRTRoIAlignCreator();
 
   const char *getPluginName() const override;
 
   const char *getPluginVersion() const override;
-
-  const nvinfer1::PluginFieldCollection *getFieldNames() override;
-
   nvinfer1::IPluginV2 *createPlugin(
       const char *name, const nvinfer1::PluginFieldCollection *fc) override;
 
   nvinfer1::IPluginV2 *deserializePlugin(const char *name,
                                          const void *serialData,
                                          size_t serialLength) override;
-
-  void setPluginNamespace(const char *pluginNamespace) override;
-
-  const char *getPluginNamespace() const override;
-
- private:
-  nvinfer1::PluginFieldCollection mFC;
-  std::vector<nvinfer1::PluginField> mPluginAttributes;
-  std::string mNamespace;
 };
+}  // namespace mmlab
 #endif  // TRT_ROI_ALIGN_HPP
diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu
index f744c80ec8..fba2b14c20 100644
--- a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu
+++ b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu
@@ -1,5 +1,98 @@
 #include "common_cuda_helper.hpp"
-#include "trt_roi_align_kernel.cuh"
+#include "float.h"
+#include "trt_roi_align_kernel.hpp"
+
+/*** Forward ***/
+template <typename T>
+__global__ void roi_align_forward_cuda_kernel(
+    const int nthreads, const T* input, const T* rois, T* output, T* argmax_y,
+    T* argmax_x, const int pooled_height, const int pooled_width,
+    const T spatial_scale, const int sampling_ratio,
+    const int pool_mode,  // 0 - max pool, 1 - avg pool
+    const bool aligned, const int channels, const int height, const int width) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not using rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (!aligned) {  // for backward-compatibility only
+      roi_width = max(roi_width, (T)1.);
+      roi_height = max(roi_height, (T)1.);
+    }
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_input =
+        input + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_height / pooled_height));
+    int roi_bin_grid_w =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_width / pooled_width));
+
+    if (pool_mode == 0) {
+      // We do max pooling inside a bin
+      T maxval = -FLT_MAX;
+      T maxidx_y = -1.f, maxidx_x = -1.f;
+      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+        const T y = roi_start_h + ph * bin_size_h +
+                    static_cast<T>(iy + .5f) * bin_size_h /
+                        static_cast<T>(roi_bin_grid_h);
+        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+          const T x = roi_start_w + pw * bin_size_w +
+                      static_cast<T>(ix + .5f) * bin_size_w /
+                          static_cast<T>(roi_bin_grid_w);
+          T val = bilinear_interpolate(offset_input, height, width, y, x);
+          if (val > maxval) {
+            maxval = val;
+            maxidx_y = y;
+            maxidx_x = x;
+          }
+        }
+      }
+      output[index] = maxval;
+      argmax_y[index] = maxidx_y;
+      argmax_x[index] = maxidx_x;
+    } else if (pool_mode == 1) {
+      // We do average pooling inside a bin
+      const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
+      T output_val = 0.;
+      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+        const T y = roi_start_h + ph * bin_size_h +
+                    static_cast<T>(iy + .5f) * bin_size_h /
+                        static_cast<T>(roi_bin_grid_h);
+        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+          const T x = roi_start_w + pw * bin_size_w +
+                      static_cast<T>(ix + .5f) * bin_size_w /
+                          static_cast<T>(roi_bin_grid_w);
+          T val = bilinear_interpolate(offset_input, height, width, y, x);
+          output_val += val;
+        }
+      }
+      output[index] = output_val / count;
+    }
+  }
+}
 
 template <typename scalar_t>
 void TRTRoIAlignForwardCUDAKernelLauncher(
@@ -15,13 +108,8 @@ void TRTRoIAlignForwardCUDAKernelLauncher(
           pool_mode, aligned, channels, height, width);
 }
 
-void TRTRoIAlignForwardCUDAKernelLauncher_float(
+template void TRTRoIAlignForwardCUDAKernelLauncher<float>(
     const float* input, const float* rois, float* output, float* argmax_y,
     float* argmax_x, int output_size, int channels, int height, int width,
     int aligned_height, int aligned_width, float spatial_scale,
-    int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream) {
-  TRTRoIAlignForwardCUDAKernelLauncher<float>(
-      input, rois, output, argmax_y, argmax_x, output_size, channels, height,
-      width, aligned_height, aligned_width, spatial_scale, sampling_ratio,
-      pool_mode, aligned, stream);
-}
+    int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream);
diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cuh b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cuh
deleted file mode 100644
index c6f61d8941..0000000000
--- a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cuh
+++ /dev/null
@@ -1,204 +0,0 @@
-#ifndef ROI_ALIGN_CUDA_KERNEL_CUH
-#define ROI_ALIGN_CUDA_KERNEL_CUH
-
-#include <float.h>
-
-#include "common_cuda_helper.hpp"
-
-/*** Forward ***/
-template <typename T>
-__global__ void roi_align_forward_cuda_kernel(
-    const int nthreads, const T* input, const T* rois, T* output, T* argmax_y,
-    T* argmax_x, const int pooled_height, const int pooled_width,
-    const T spatial_scale, const int sampling_ratio,
-    const int pool_mode,  // 0 - max pool, 1 - avg pool
-    const bool aligned, const int channels, const int height, const int width) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // (n, c, ph, pw) is an element in the pooled output
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
-
-    const T* offset_rois = rois + n * 5;
-    int roi_batch_ind = offset_rois[0];
-
-    // Do not using rounding; this implementation detail is critical
-    T offset = aligned ? (T)0.5 : (T)0.0;
-    T roi_start_w = offset_rois[1] * spatial_scale - offset;
-    T roi_start_h = offset_rois[2] * spatial_scale - offset;
-    T roi_end_w = offset_rois[3] * spatial_scale - offset;
-    T roi_end_h = offset_rois[4] * spatial_scale - offset;
-
-    T roi_width = roi_end_w - roi_start_w;
-    T roi_height = roi_end_h - roi_start_h;
-    if (!aligned) {  // for backward-compatibility only
-      roi_width = max(roi_width, (T)1.);
-      roi_height = max(roi_height, (T)1.);
-    }
-
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    const T* offset_input =
-        input + (roi_batch_ind * channels + c) * height * width;
-
-    // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h =
-        (sampling_ratio > 0)
-            ? sampling_ratio
-            : static_cast<int>(ceilf(roi_height / pooled_height));
-    int roi_bin_grid_w =
-        (sampling_ratio > 0)
-            ? sampling_ratio
-            : static_cast<int>(ceilf(roi_width / pooled_width));
-
-    if (pool_mode == 0) {
-      // We do max pooling inside a bin
-      T maxval = -FLT_MAX;
-      T maxidx_y = -1.f, maxidx_x = -1.f;
-      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-        const T y = roi_start_h + ph * bin_size_h +
-                    static_cast<T>(iy + .5f) * bin_size_h /
-                        static_cast<T>(roi_bin_grid_h);
-        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-          const T x = roi_start_w + pw * bin_size_w +
-                      static_cast<T>(ix + .5f) * bin_size_w /
-                          static_cast<T>(roi_bin_grid_w);
-          T val =
-              bilinear_interpolate(offset_input, height, width, y, x, index);
-          if (val > maxval) {
-            maxval = val;
-            maxidx_y = y;
-            maxidx_x = x;
-          }
-        }
-      }
-      output[index] = maxval;
-      argmax_y[index] = maxidx_y;
-      argmax_x[index] = maxidx_x;
-    } else if (pool_mode == 1) {
-      // We do average pooling inside a bin
-      const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
-      T output_val = 0.;
-      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-        const T y = roi_start_h + ph * bin_size_h +
-                    static_cast<T>(iy + .5f) * bin_size_h /
-                        static_cast<T>(roi_bin_grid_h);
-        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-          const T x = roi_start_w + pw * bin_size_w +
-                      static_cast<T>(ix + .5f) * bin_size_w /
-                          static_cast<T>(roi_bin_grid_w);
-          T val =
-              bilinear_interpolate(offset_input, height, width, y, x, index);
-          output_val += val;
-        }
-      }
-      output[index] = output_val / count;
-    }
-  }
-}
-
-/*** Backward ***/
-template <typename T>
-__global__ void roi_align_backward_cuda_kernel(
-    const int nthreads, const T* grad_output, const T* rois, const T* argmax_y,
-    const T* argmax_x, T* grad_input, const int pooled_height,
-    const int pooled_width, const T spatial_scale, const int sampling_ratio,
-    const int pool_mode,  // 0 - max pool, 1 - avg pool
-    const bool aligned, const int channels, const int height, const int width) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // (n, c, ph, pw) is an element in the pooled output
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
-
-    const T grad_output_this_bin = grad_output[index];
-
-    const T* offset_rois = rois + n * 5;
-    int roi_batch_ind = offset_rois[0];
-    T* offset_grad_input =
-        grad_input + ((roi_batch_ind * channels + c) * height * width);
-
-    if (pool_mode == 0) {
-      T y = argmax_y[index], x = argmax_x[index];
-      if (y != -1.f) {
-        T w1, w2, w3, w4;
-        int x_low, x_high, y_low, y_high;
-        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
-                                      x_low, x_high, y_low, y_high, index);
-
-        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-          atomicAdd(offset_grad_input + y_low * width + x_low,
-                    grad_output_this_bin * w1);
-          atomicAdd(offset_grad_input + y_low * width + x_high,
-                    grad_output_this_bin * w2);
-          atomicAdd(offset_grad_input + y_high * width + x_low,
-                    grad_output_this_bin * w3);
-          atomicAdd(offset_grad_input + y_high * width + x_high,
-                    grad_output_this_bin * w4);
-        }
-      }
-    } else if (pool_mode == 1) {
-      // Do not using rounding; this implementation detail is critical
-      T offset = aligned ? (T)0.5 : (T)0.0;
-      T roi_start_w = offset_rois[1] * spatial_scale - offset;
-      T roi_start_h = offset_rois[2] * spatial_scale - offset;
-      T roi_end_w = offset_rois[3] * spatial_scale - offset;
-      T roi_end_h = offset_rois[4] * spatial_scale - offset;
-
-      T roi_width = roi_end_w - roi_start_w;
-      T roi_height = roi_end_h - roi_start_h;
-      if (!aligned) {  // for backward-compatibility only
-        roi_width = max(roi_width, (T)1.);
-        roi_height = max(roi_height, (T)1.);
-      }
-
-      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-      // We use roi_bin_grid to sample the grid and mimic integral
-      int roi_bin_grid_h =
-          (sampling_ratio > 0)
-              ? sampling_ratio
-              : static_cast<int>(ceilf(roi_height / pooled_height));
-      int roi_bin_grid_w =
-          (sampling_ratio > 0)
-              ? sampling_ratio
-              : static_cast<int>(ceilf(roi_width / pooled_width));
-
-      // We do average (integral) pooling inside a bin
-      const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
-
-      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-        const T y = roi_start_h + ph * bin_size_h +
-                    static_cast<T>(iy + .5f) * bin_size_h /
-                        static_cast<T>(roi_bin_grid_h);
-        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-          const T x = roi_start_w + pw * bin_size_w +
-                      static_cast<T>(ix + .5f) * bin_size_w /
-                          static_cast<T>(roi_bin_grid_w);
-
-          T w1, w2, w3, w4;
-          int x_low, x_high, y_low, y_high;
-          bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
-                                        x_low, x_high, y_low, y_high, index);
-
-          if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-            atomicAdd(offset_grad_input + y_low * width + x_low,
-                      grad_output_this_bin * w1 / count);
-            atomicAdd(offset_grad_input + y_low * width + x_high,
-                      grad_output_this_bin * w2 / count);
-            atomicAdd(offset_grad_input + y_high * width + x_low,
-                      grad_output_this_bin * w3 / count);
-            atomicAdd(offset_grad_input + y_high * width + x_high,
-                      grad_output_this_bin * w4 / count);
-          }
-        }
-      }
-    }
-  }
-}
-
-#endif  // ROI_ALIGN_CUDA_KERNEL_CUH
diff --git a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp
new file mode 100644
index 0000000000..d5a8dd551c
--- /dev/null
+++ b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp
@@ -0,0 +1,14 @@
+#ifndef ROI_ALIGN_CUDA_KERNEL_HPP
+#define ROI_ALIGN_CUDA_KERNEL_HPP
+
+#include "common_cuda_helper.hpp"
+
+template <typename scalar_t>
+void TRTRoIAlignForwardCUDAKernelLauncher(
+    const scalar_t* input, const scalar_t* rois, scalar_t* output,
+    scalar_t* argmax_y, scalar_t* argmax_x, int output_size, int channels,
+    int height, int width, int aligned_height, int aligned_width,
+    scalar_t spatial_scale, int sampling_ratio, int pool_mode, bool aligned,
+    cudaStream_t stream);
+
+#endif  // ROI_ALIGN_CUDA_KERNEL_HPP
diff --git a/backend_ops/tensorrt/scatternd/trt_scatternd.cpp b/backend_ops/tensorrt/scatternd/trt_scatternd.cpp
index 03e4d657f3..f5c85909f2 100644
--- a/backend_ops/tensorrt/scatternd/trt_scatternd.cpp
+++ b/backend_ops/tensorrt/scatternd/trt_scatternd.cpp
@@ -8,32 +8,32 @@
 #include "trt_scatternd_kernel.hpp"
 #include "trt_serialize.hpp"
 
+namespace mmlab {
 namespace {
 static const char *PLUGIN_VERSION{"1"};
 static const char *PLUGIN_NAME{"ScatterND"};
 }  // namespace
 
-ONNXScatterNDDynamic::ONNXScatterNDDynamic(const std::string &name)
-    : mLayerName(name) {}
+TRTScatterND::TRTScatterND(const std::string &name) : TRTPluginBase(name) {}
 
-ONNXScatterNDDynamic::ONNXScatterNDDynamic(const std::string name,
-                                           const void *data, size_t length)
-    : mLayerName(name) {}
+TRTScatterND::TRTScatterND(const std::string name, const void *data,
+                           size_t length)
+    : TRTPluginBase(name) {}
 
-nvinfer1::IPluginV2DynamicExt *ONNXScatterNDDynamic::clone() const {
-  ONNXScatterNDDynamic *plugin = new ONNXScatterNDDynamic(mLayerName);
+nvinfer1::IPluginV2DynamicExt *TRTScatterND::clone() const {
+  TRTScatterND *plugin = new TRTScatterND(mLayerName);
   plugin->setPluginNamespace(getPluginNamespace());
 
   return plugin;
 }
 
-nvinfer1::DimsExprs ONNXScatterNDDynamic::getOutputDimensions(
+nvinfer1::DimsExprs TRTScatterND::getOutputDimensions(
     int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
     nvinfer1::IExprBuilder &exprBuilder) {
   return inputs[0];
 }
 
-bool ONNXScatterNDDynamic::supportsFormatCombination(
+bool TRTScatterND::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
     int nbOutputs) {
   if (pos < nbInputs) {
@@ -68,21 +68,21 @@ bool ONNXScatterNDDynamic::supportsFormatCombination(
   return true;
 }
 
-void ONNXScatterNDDynamic::configurePlugin(
+void TRTScatterND::configurePlugin(
     const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
     const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
 
-size_t ONNXScatterNDDynamic::getWorkspaceSize(
-    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
-    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
+size_t TRTScatterND::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
+                                      int nbInputs,
+                                      const nvinfer1::PluginTensorDesc *outputs,
+                                      int nbOutputs) const {
   return 0;
 }
 
-int ONNXScatterNDDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
-                                  const nvinfer1::PluginTensorDesc *outputDesc,
-                                  const void *const *inputs,
-                                  void *const *outputs, void *workSpace,
-                                  cudaStream_t stream) {
+int TRTScatterND::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
+                          const nvinfer1::PluginTensorDesc *outputDesc,
+                          const void *const *inputs, void *const *outputs,
+                          void *workSpace, cudaStream_t stream) {
   const int *dims = &(inputDesc[0].dims.d[0]);
   const int *indices_dims = &(inputDesc[1].dims.d[0]);
   int nbDims = inputDesc[0].dims.nbDims;
@@ -114,82 +114,47 @@ int ONNXScatterNDDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
   return 0;
 }
 
-nvinfer1::DataType ONNXScatterNDDynamic::getOutputDataType(
+nvinfer1::DataType TRTScatterND::getOutputDataType(
     int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
   return inputTypes[0];
 }
 
 // IPluginV2 Methods
-const char *ONNXScatterNDDynamic::getPluginType() const { return PLUGIN_NAME; }
+const char *TRTScatterND::getPluginType() const { return PLUGIN_NAME; }
 
-const char *ONNXScatterNDDynamic::getPluginVersion() const {
-  return PLUGIN_VERSION;
-}
-
-int ONNXScatterNDDynamic::getNbOutputs() const { return 1; }
-
-int ONNXScatterNDDynamic::initialize() { return 0; }
-
-void ONNXScatterNDDynamic::terminate() {}
-
-size_t ONNXScatterNDDynamic::getSerializationSize() const { return 0; }
-
-void ONNXScatterNDDynamic::serialize(void *buffer) const {}
-
-void ONNXScatterNDDynamic::destroy() {
-  // This gets called when the network containing plugin is destroyed
-  delete this;
-}
+const char *TRTScatterND::getPluginVersion() const { return PLUGIN_VERSION; }
 
-void ONNXScatterNDDynamic::setPluginNamespace(const char *libNamespace) {
-  mNamespace = libNamespace;
-}
+int TRTScatterND::getNbOutputs() const { return 1; }
 
-const char *ONNXScatterNDDynamic::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
+size_t TRTScatterND::getSerializationSize() const { return 0; }
 
-////////////////////// creator /////////////////////////////
+void TRTScatterND::serialize(void *buffer) const {}
 
-ONNXScatterNDDynamicCreator::ONNXScatterNDDynamicCreator() {
+TRTScatterNDCreator::TRTScatterNDCreator() {
   mPluginAttributes.clear();
   mFC.nbFields = mPluginAttributes.size();
   mFC.fields = mPluginAttributes.data();
 }
 
-const char *ONNXScatterNDDynamicCreator::getPluginName() const {
-  return PLUGIN_NAME;
-}
+const char *TRTScatterNDCreator::getPluginName() const { return PLUGIN_NAME; }
 
-const char *ONNXScatterNDDynamicCreator::getPluginVersion() const {
+const char *TRTScatterNDCreator::getPluginVersion() const {
   return PLUGIN_VERSION;
 }
 
-const nvinfer1::PluginFieldCollection *
-ONNXScatterNDDynamicCreator::getFieldNames() {
-  return &mFC;
-}
-
-nvinfer1::IPluginV2 *ONNXScatterNDDynamicCreator::createPlugin(
+nvinfer1::IPluginV2 *TRTScatterNDCreator::createPlugin(
     const char *name, const nvinfer1::PluginFieldCollection *fc) {
-  ONNXScatterNDDynamic *plugin = new ONNXScatterNDDynamic(name);
+  TRTScatterND *plugin = new TRTScatterND(name);
   plugin->setPluginNamespace(getPluginNamespace());
   return plugin;
 }
 
-nvinfer1::IPluginV2 *ONNXScatterNDDynamicCreator::deserializePlugin(
+nvinfer1::IPluginV2 *TRTScatterNDCreator::deserializePlugin(
     const char *name, const void *serialData, size_t serialLength) {
-  auto plugin = new ONNXScatterNDDynamic(name, serialData, serialLength);
+  auto plugin = new TRTScatterND(name, serialData, serialLength);
   plugin->setPluginNamespace(getPluginNamespace());
   return plugin;
 }
 
-void ONNXScatterNDDynamicCreator::setPluginNamespace(const char *libNamespace) {
-  mNamespace = libNamespace;
-}
-
-const char *ONNXScatterNDDynamicCreator::getPluginNamespace() const {
-  return mNamespace.c_str();
-}
-
-REGISTER_TENSORRT_PLUGIN(ONNXScatterNDDynamicCreator);
+REGISTER_TENSORRT_PLUGIN(TRTScatterNDCreator);
+}  // namespace mmlab
diff --git a/backend_ops/tensorrt/scatternd/trt_scatternd.hpp b/backend_ops/tensorrt/scatternd/trt_scatternd.hpp
index f64662c0f0..a5e08ce835 100644
--- a/backend_ops/tensorrt/scatternd/trt_scatternd.hpp
+++ b/backend_ops/tensorrt/scatternd/trt_scatternd.hpp
@@ -6,15 +6,16 @@
 #include <string>
 #include <vector>
 
-#include "trt_plugin_helper.hpp"
+#include "trt_plugin_base.hpp"
 
-class ONNXScatterNDDynamic : public nvinfer1::IPluginV2DynamicExt {
+namespace mmlab {
+class TRTScatterND : public TRTPluginBase {
  public:
-  ONNXScatterNDDynamic(const std::string &name);
+  TRTScatterND(const std::string &name);
 
-  ONNXScatterNDDynamic(const std::string name, const void *data, size_t length);
+  TRTScatterND(const std::string name, const void *data, size_t length);
 
-  ONNXScatterNDDynamic() = delete;
+  TRTScatterND() = delete;
 
   // IPluginV2DynamicExt Methods
   nvinfer1::IPluginV2DynamicExt *clone() const override;
@@ -46,53 +47,23 @@ class ONNXScatterNDDynamic : public nvinfer1::IPluginV2DynamicExt {
   const char *getPluginType() const override;
   const char *getPluginVersion() const override;
   int getNbOutputs() const override;
-  int initialize() override;
-  void terminate() override;
   size_t getSerializationSize() const override;
   void serialize(void *buffer) const override;
-  void destroy() override;
-  void setPluginNamespace(const char *pluginNamespace) override;
-  const char *getPluginNamespace() const override;
-
- private:
-  const std::string mLayerName;
-  std::string mNamespace;
-
- protected:
-  // To prevent compiler warnings.
-  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::configurePlugin;
-  using nvinfer1::IPluginV2DynamicExt::enqueue;
-  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
-  using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize;
-  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
-  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
 };
 
-class ONNXScatterNDDynamicCreator : public nvinfer1::IPluginCreator {
+class TRTScatterNDCreator : public TRTPluginCreatorBase {
  public:
-  ONNXScatterNDDynamicCreator();
+  TRTScatterNDCreator();
 
   const char *getPluginName() const override;
 
   const char *getPluginVersion() const override;
-
-  const nvinfer1::PluginFieldCollection *getFieldNames() override;
-
   nvinfer1::IPluginV2 *createPlugin(
       const char *name, const nvinfer1::PluginFieldCollection *fc) override;
 
   nvinfer1::IPluginV2 *deserializePlugin(const char *name,
                                          const void *serialData,
                                          size_t serialLength) override;
-
-  void setPluginNamespace(const char *pluginNamespace) override;
-
-  const char *getPluginNamespace() const override;
-
- private:
-  nvinfer1::PluginFieldCollection mFC;
-  std::vector<nvinfer1::PluginField> mPluginAttributes;
-  std::string mNamespace;
 };
+}  // namespace mmlab
 #endif  // TRT_SCATTERND_HPP
diff --git a/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu b/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu
index d7b59d0c10..2adda61485 100644
--- a/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu
+++ b/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu
@@ -3,7 +3,6 @@
 #include <vector>
 
 #include "common_cuda_helper.hpp"
-#include "trt_cuda_helper.cuh"
 #include "trt_plugin_helper.hpp"
 
 using mmlab::TensorDesc;