From 0ff79a2930699dc230a07426ce3e79d8d686d810 Mon Sep 17 00:00:00 2001
From: Rafal Banas <rbanas@nvidia.com>
Date: Mon, 7 Oct 2024 14:04:50 +0200
Subject: [PATCH] Improve performance of experimental.resize

Signed-off-by: Rafal Banas <rbanas@nvidia.com>
---
 .../experimental/resize_op_impl_cvcuda.h      | 80 +++++++------------
 dali/operators/nvcvop/nvcvop.cc               | 54 +++++++++++--
 dali/operators/nvcvop/nvcvop.h                | 26 +++---
 3 files changed, 90 insertions(+), 70 deletions(-)
diff --git a/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h b/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h
index 285365fd2b5..08e75af6037 100644
--- a/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h
+++ b/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h
@@ -77,9 +77,9 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
   }
 
   void SetupKernel() {
-    kernels::KernelContext ctx;
     rois_.resize(total_frames_);
-    workspace_reqs_ = {};
+    workspace_reqs_[0] = {};
+    workspace_reqs_[1] = {};
     std::vector<HQResizeTensorShapeI> mb_input_shapes(minibatch_size_);
     std::vector<HQResizeTensorShapeI> mb_output_shapes(minibatch_size_);
     auto *rois_ptr = rois_.data();
@@ -111,7 +111,7 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
       auto ws_req = resize_op_.getWorkspaceRequirements(mb.count, mb_input_shape, mb_output_shape,
                                                         mb.min_interpolation, mb.mag_interpolation,
                                                         mb.antialias, mb.rois);
-      workspace_reqs_ = nvcvop::MaxWorkspaceRequirements(workspace_reqs_, ws_req);
+      workspace_reqs_[mb_idx % 2] = cvcuda::MaxWorkspaceReq(workspace_reqs_[mb_idx % 2], ws_req);
     }
   }
 
@@ -146,28 +146,46 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
 
   void RunResize(Workspace &ws, TensorList<GPUBackend> &output,
                  const TensorList<GPUBackend> &input) override {
-    TensorList<GPUBackend> in_frames;
-    in_frames.ShareData(input);
-    in_frames.Resize(in_shape_);
-    PrepareInput(in_frames);
-
-    TensorList<GPUBackend> out_frames;
-    out_frames.ShareData(output);
-    out_frames.Resize(out_shape_);
-    PrepareOutput(out_frames);
+    // Declared first so that it outlives the allocator and the tensor batches that use it.
+    kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream()));
+    auto allocator = nvcvop::GetScratchpadAllocator(scratchpad);
 
+    in_frames_.ShareData(input);
+    in_frames_.Resize(in_shape_);
 
-    kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream()));
+    out_frames_.ShareData(output);
+    out_frames_.Resize(out_shape_);
 
-    auto workspace_mem = op_workspace_.Allocate(workspace_reqs_, scratchpad);
+    auto workspace_mem = AllocateWorkspaces(scratchpad);
 
     for (size_t b = 0; b < minibatches_.size(); b++) {
       MiniBatch &mb = minibatches_[b];
-      resize_op_(ws.stream(), workspace_mem, mb.input, mb.output, mb.min_interpolation,
+      auto reqs = nvcv::TensorBatch::CalcRequirements(mb.count);
+      auto mb_output = nvcv::TensorBatch(reqs, allocator);
+      auto mb_input = nvcv::TensorBatch(reqs, allocator);
+      nvcvop::PushTensorsToBatch(mb_input, in_frames_, mb.start, mb.count, sample_layout_);
+      nvcvop::PushTensorsToBatch(mb_output, out_frames_, mb.start, mb.count, sample_layout_);
+      // Consecutive minibatches alternate between the two workspaces.
+      resize_op_(ws.stream(), workspace_mem[b % 2], mb_input, mb_output, mb.min_interpolation,
                  mb.mag_interpolation, mb.antialias, mb.rois);
     }
   }
 
+  /**
+   * @brief Allocates the per-minibatch workspaces from the given scratchpad.
+   *
+   * Slot 0 is sized by workspace_reqs_[0] and slot 1 by workspace_reqs_[1]. Slot 1 is allocated
+   * only when there is more than one minibatch; otherwise it stays value-initialized.
+   */
+  std::array<cvcuda::Workspace, 2> AllocateWorkspaces(kernels::Scratchpad &scratchpad) {
+    std::array<cvcuda::Workspace, 2> result{};
+    result[0] = op_workspace_.Allocate(workspace_reqs_[0], scratchpad);
+    if (minibatches_.size() > 1) {
+      result[1] = op_workspace_.Allocate(workspace_reqs_[1], scratchpad);
+    }
+    return result;
+  }
+
   void CalculateMinibatchPartition(int minibatch_size) {
     std::vector<std::pair<int, int>> continuous_ranges;
     kernels::FilterDesc min_filter_desc = params_[frame_idx(0)][0].min_filter;
@@ -210,14 +220,15 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
 
   cvcuda::HQResize resize_op_{};
   nvcvop::NVCVOpWorkspace op_workspace_;
-  cvcuda::WorkspaceRequirements workspace_reqs_{};
+  std::array<cvcuda::WorkspaceRequirements, 2> workspace_reqs_{};
   std::vector<HQResizeRoiF> rois_;
   const TensorLayout sample_layout_ = (spatial_ndim == 2) ? "HWC" : "DHWC";
 
+  TensorList<GPUBackend> in_frames_;
+  TensorList<GPUBackend> out_frames_;
+
   struct MiniBatch {
     int start, count;
-    nvcv::TensorBatch input;
-    nvcv::TensorBatch output;
     NVCVInterpolationType min_interpolation;
     NVCVInterpolationType mag_interpolation;
     bool antialias;
@@ -225,39 +236,6 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
   };
 
   std::vector<MiniBatch> minibatches_;
-
-  void PrepareInput(const TensorList<GPUBackend> &input) {
-    for (auto &mb : minibatches_) {
-      int curr_capacity = mb.input ? mb.input.capacity() : 0;
-      if (mb.count > curr_capacity) {
-        int new_capacity = std::max(mb.count, curr_capacity * 2);
-        auto reqs = nvcv::TensorBatch::CalcRequirements(new_capacity);
-        mb.input = nvcv::TensorBatch(reqs);
-      } else {
-        mb.input.clear();
-      }
-      for (int i = mb.start; i < mb.start + mb.count; ++i) {
-        mb.input.pushBack(nvcvop::AsTensor(input[frame_idx(i)], sample_layout_));
-      }
-    }
-  }
-
-  void PrepareOutput(const TensorList<GPUBackend> &out) {
-    for (auto &mb : minibatches_) {
-      int curr_capacity = mb.output ? mb.output.capacity() : 0;
-      if (mb.count > curr_capacity) {
-        int new_capacity = std::max(mb.count, curr_capacity * 2);
-        auto reqs = nvcv::TensorBatch::CalcRequirements(new_capacity);
-        mb.output = nvcv::TensorBatch(reqs);
-      } else {
-        mb.output.clear();
-      }
-      for (int i = mb.start; i < mb.start + mb.count; ++i) {
-        mb.output.pushBack(nvcvop::AsTensor(out[frame_idx(i)], sample_layout_));
-      }
-    }
-  }
-
   int minibatch_size_;
 };
 
diff --git a/dali/operators/nvcvop/nvcvop.cc b/dali/operators/nvcvop/nvcvop.cc
index a82982eb71a..ae327cbd0a9 100644
--- a/dali/operators/nvcvop/nvcvop.cc
+++ b/dali/operators/nvcvop/nvcvop.cc
@@ -14,8 +14,8 @@
 
 #include "dali/operators/nvcvop/nvcvop.h"
 
-
 #include <string>
+#include <utility>
 
 namespace dali::nvcvop {
 
@@ -208,7 +208,7 @@ nvcv::Tensor AsTensor(ConstSampleView<GPUBackend> sample, TensorLayout layout,
   return AsTensor(const_cast<void *>(sample.raw_data()), shape, sample.type(), layout);
 }
 
-nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType daliDType,
+nvcv::Tensor AsTensor(void *data, const TensorShape<> &shape, DALIDataType daliDType,
                       TensorLayout layout) {
   auto dtype = GetDataType(daliDType, 1);
   nvcv::TensorDataStridedCuda::Buffer inBuf;
@@ -225,11 +225,59 @@ nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType daliDT
   return nvcv::TensorWrapData(inData);
 }
 
+/**
+ * @brief Wraps a buffer in an nvcv::Tensor using strided CUDA tensor data.
+ *
+ * The strides describe a densely packed tensor: the innermost stride is dtype.strideBytes()
+ * and every outer stride is the product of the next inner extent and the next inner stride.
+ *
+ * @param data       base pointer of the buffer (constness is cast away for NVCV)
+ * @param shape_data extents of the tensor; at least one dimension is required
+ * @param dtype      NVCV data type of the elements
+ * @param layout     NVCV layout attached to the resulting tensor shape
+ */
+nvcv::Tensor AsTensor(const void *data, span<const int64_t> shape_data, const nvcv::DataType &dtype,
+                      const nvcv::TensorLayout &layout) {
+  nvcv::TensorDataStridedCuda::Buffer inBuf{};
+  const int ndim = shape_data.size();
+  const int max_ndim = static_cast<int>(std::size(inBuf.strides));
+  // ndim == 0 would write strides[-1] and ndim > max_ndim would overflow the strides array.
+  DALI_ENFORCE(ndim >= 1 && ndim <= max_ndim,
+               make_string("Unsupported number of dimensions: ", ndim,
+                           "; expected a value in [1, ", max_ndim, "]."));
+  inBuf.basePtr = reinterpret_cast<NVCVByte *>(const_cast<void *>(data));
+  inBuf.strides[ndim - 1] = dtype.strideBytes();
+  for (int d = ndim - 2; d >= 0; --d) {
+    inBuf.strides[d] = shape_data[d + 1] * inBuf.strides[d + 1];
+  }
+  nvcv::TensorShape out_shape(shape_data.data(), ndim, layout);
+  nvcv::TensorDataStridedCuda inData(out_shape, dtype, inBuf);
+  return nvcv::TensorWrapData(inData);
+}
+
+
 void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
-                        TensorLayout layout) {
-  for (int s = 0; s < t_list.num_samples(); ++s) {
-    batch.pushBack(AsTensor(t_list[s], layout));
+                        int64_t start, int64_t count, const TensorLayout &layout) {
+  DALI_ENFORCE(start >= 0 && count >= 0 && start + count <= t_list.num_samples(),
+               make_string("Sample range [", start, ", ", start + count,
+                           ") is out of bounds for a TensorList with ",
+                           t_list.num_samples(), " samples."));
+  int ndim = t_list.sample_dim();
+  auto dtype = GetDataType(t_list.type(), 1);
+  TensorLayout out_layout = layout.empty() ? t_list.GetLayout() : layout;
+  DALI_ENFORCE(
+      out_layout.empty() || out_layout.size() == ndim,
+      make_string("Layout ", out_layout, " does not match the number of dimensions: ", ndim));
+  auto nvcv_layout = nvcv::TensorLayout(out_layout.c_str());
+  // Wrap all samples first, then append them to the batch with a single pushBack call.
+  std::vector<nvcv::Tensor> tensors;
+  tensors.reserve(count);
+
+  for (int64_t s = 0; s < count; ++s) {
+    tensors.push_back(AsTensor(t_list.raw_tensor(s + start), t_list.tensor_shape_span(s + start),
+                               dtype, nvcv_layout));
   }
+  batch.pushBack(tensors.begin(), tensors.end());
 }
 
 cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements &reqs,
@@ -248,4 +275,29 @@ cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements
   return workspace_;
 }
 
+nvcv::Allocator GetScratchpadAllocator(kernels::Scratchpad &scratchpad) {
+  // The allocation callbacks hold a reference to `scratchpad`, so the returned allocator
+  // must not be used after the scratchpad is destroyed. Deallocation callbacks are no-ops.
+  auto host_alloc = nvcv::CustomHostMemAllocator(
+      [&scratchpad](int64_t size, int32_t align) {
+        return scratchpad.AllocateHost<uint8_t>(size, align);
+      },
+      [](void *, int64_t, int32_t) {});
+
+  auto pinned_alloc = nvcv::CustomHostPinnedMemAllocator(
+      [&scratchpad](int64_t size, int32_t align) {
+        return scratchpad.AllocatePinned<uint8_t>(size, align);
+      },
+      [](void *, int64_t, int32_t) {});
+
+  auto gpu_alloc = nvcv::CustomCudaMemAllocator(
+      [&scratchpad](int64_t size, int32_t align) {
+        return scratchpad.AllocateGPU<uint8_t>(size, align);
+      },
+      [](void *, int64_t, int32_t) {});
+
+  return nvcv::CustomAllocator(std::move(host_alloc), std::move(pinned_alloc),
+                               std::move(gpu_alloc));
+}
+
 }  // namespace dali::nvcvop
diff --git a/dali/operators/nvcvop/nvcvop.h b/dali/operators/nvcvop/nvcvop.h
index 998fd9ba108..c4e61161c2d 100644
--- a/dali/operators/nvcvop/nvcvop.h
+++ b/dali/operators/nvcvop/nvcvop.h
@@ -18,6 +18,7 @@
 #include <nvcv/DataType.h>
 #include <nvcv/BorderType.h>
 #include <cvcuda/Types.h>
+#include <nvcv/alloc/Allocator.hpp>
 #include <cvcuda/Workspace.hpp>
 #include <nvcv/Tensor.hpp>
 #include <nvcv/TensorBatch.hpp>
@@ -34,6 +35,7 @@
 #include "dali/pipeline/operator/sequence_operator.h"
 #include "dali/core/cuda_event_pool.h"
 
+
 namespace dali::nvcvop {
 
 /**
@@ -112,7 +114,7 @@ nvcv::Tensor AsTensor(SampleView<GPUBackend> sample, TensorLayout layout = "",
 nvcv::Tensor AsTensor(ConstSampleView<GPUBackend> sample, TensorLayout layout = "",
                       const std::optional<TensorShape<>> &reshape = std::nullopt);
 
-nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType dtype,
+nvcv::Tensor AsTensor(void *data, const TensorShape<> &shape, DALIDataType dtype,
                       TensorLayout layout);
 
 /**
@@ -132,9 +134,12 @@ void AllocateImagesLike(nvcv::ImageBatchVarShape &output, const TensorList<GPUBa
  */
 void PushImagesToBatch(nvcv::ImageBatchVarShape &batch, const TensorList<GPUBackend> &t_list);
 
-
+/**
+ * @brief Push samples [start, start+count) of the given TensorList to the given TensorBatch.
+ * If `layout` is empty, the layout of `t_list` is used. Non-empty layouts must match sample ndim.
+ */
 void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
-                        TensorLayout layout);
+                        int64_t start, int64_t count, const TensorLayout &layout);
 
 class NVCVOpWorkspace {
  public:
@@ -165,17 +170,10 @@ class NVCVOpWorkspace {
   int device_id_{};
 };
 
-inline cvcuda::WorkspaceRequirements MaxWorkspaceRequirements(
-    const cvcuda::WorkspaceRequirements &a, const cvcuda::WorkspaceRequirements &b) {
-  cvcuda::WorkspaceRequirements max;
-  max.hostMem.size = std::max(a.hostMem.size, b.hostMem.size);
-  max.hostMem.alignment = std::max(a.hostMem.alignment, b.hostMem.alignment);
-  max.pinnedMem.size = std::max(a.pinnedMem.size, b.pinnedMem.size);
-  max.pinnedMem.alignment = std::max(a.pinnedMem.alignment, b.pinnedMem.alignment);
-  max.cudaMem.size = std::max(a.cudaMem.size, b.cudaMem.size);
-  max.cudaMem.alignment = std::max(a.cudaMem.alignment, b.cudaMem.alignment);
-  return max;
-}
+/**
+ * @brief Create an NVCV allocator backed by `scratchpad`, which must outlive the allocator.
+ */
+nvcv::Allocator GetScratchpadAllocator(kernels::Scratchpad &scratchpad);
 
 /**
  * @brief A base class for the CVCUDA operators.