From 0ff79a2930699dc230a07426ce3e79d8d686d810 Mon Sep 17 00:00:00 2001 From: Rafal Banas Date: Mon, 7 Oct 2024 14:04:50 +0200 Subject: [PATCH] Improve performance of experimental.resize Signed-off-by: Rafal Banas --- .../experimental/resize_op_impl_cvcuda.h | 80 +++++++------------ dali/operators/nvcvop/nvcvop.cc | 54 +++++++++++-- dali/operators/nvcvop/nvcvop.h | 26 +++--- 3 files changed, 90 insertions(+), 70 deletions(-) diff --git a/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h b/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h index 285365fd2b5..08e75af6037 100644 --- a/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h +++ b/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h @@ -77,9 +77,9 @@ class ResizeOpImplCvCuda : public ResizeBase::Impl { } void SetupKernel() { - kernels::KernelContext ctx; rois_.resize(total_frames_); - workspace_reqs_ = {}; + workspace_reqs_[0] = {}; + workspace_reqs_[1] = {}; std::vector mb_input_shapes(minibatch_size_); std::vector mb_output_shapes(minibatch_size_); auto *rois_ptr = rois_.data(); @@ -111,7 +111,7 @@ class ResizeOpImplCvCuda : public ResizeBase::Impl { auto ws_req = resize_op_.getWorkspaceRequirements(mb.count, mb_input_shape, mb_output_shape, mb.min_interpolation, mb.mag_interpolation, mb.antialias, mb.rois); - workspace_reqs_ = nvcvop::MaxWorkspaceRequirements(workspace_reqs_, ws_req); + workspace_reqs_[mb_idx % 2] = cvcuda::MaxWorkspaceReq(workspace_reqs_[mb_idx % 2], ws_req); } } @@ -146,28 +146,38 @@ class ResizeOpImplCvCuda : public ResizeBase::Impl { void RunResize(Workspace &ws, TensorList &output, const TensorList &input) override { - TensorList in_frames; - in_frames.ShareData(input); - in_frames.Resize(in_shape_); - PrepareInput(in_frames); - - TensorList out_frames; - out_frames.ShareData(output); - out_frames.Resize(out_shape_); - PrepareOutput(out_frames); + kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream())); + auto allocator = nvcvop::GetScratchpadAllocator(scratchpad); + in_frames_.ShareData(input); + in_frames_.Resize(in_shape_); - kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream())); + out_frames_.ShareData(output); + out_frames_.Resize(out_shape_); - auto workspace_mem = op_workspace_.Allocate(workspace_reqs_, scratchpad); + auto workspace_mem = AllocateWorkspaces(scratchpad); for (size_t b = 0; b < minibatches_.size(); b++) { MiniBatch &mb = minibatches_[b]; - resize_op_(ws.stream(), workspace_mem, mb.input, mb.output, mb.min_interpolation, + auto reqs = nvcv::TensorBatch::CalcRequirements(mb.count); + auto mb_output = nvcv::TensorBatch(reqs, allocator); + auto mb_input = nvcv::TensorBatch(reqs, allocator); + nvcvop::PushTensorsToBatch(mb_input, in_frames_, mb.start, mb.count, sample_layout_); + nvcvop::PushTensorsToBatch(mb_output, out_frames_, mb.start, mb.count, sample_layout_); + resize_op_(ws.stream(), workspace_mem[b % 2], mb_input, mb_output, mb.min_interpolation, mb.mag_interpolation, mb.antialias, mb.rois); } } + std::array AllocateWorkspaces(kernels::Scratchpad &scratchpad) { + std::array result; + result[0] = op_workspace_.Allocate(workspace_reqs_[0], scratchpad); + if (minibatches_.size() > 1) { + result[1] = op_workspace_.Allocate(workspace_reqs_[1], scratchpad); + } + return result; + } + void CalculateMinibatchPartition(int minibatch_size) { std::vector> continuous_ranges; kernels::FilterDesc min_filter_desc = params_[frame_idx(0)][0].min_filter; @@ -210,14 +220,15 @@ class ResizeOpImplCvCuda : public ResizeBase::Impl { cvcuda::HQResize resize_op_{}; nvcvop::NVCVOpWorkspace op_workspace_; - cvcuda::WorkspaceRequirements workspace_reqs_{}; + std::array workspace_reqs_{}; std::vector rois_; const TensorLayout sample_layout_ = (spatial_ndim == 2) ? "HWC" : "DHWC"; + TensorList in_frames_; + TensorList out_frames_; + struct MiniBatch { int start, count; - nvcv::TensorBatch input; - nvcv::TensorBatch output; NVCVInterpolationType min_interpolation; NVCVInterpolationType mag_interpolation; bool antialias; @@ -225,39 +236,6 @@ class ResizeOpImplCvCuda : public ResizeBase::Impl { }; std::vector minibatches_; - - void PrepareInput(const TensorList &input) { - for (auto &mb : minibatches_) { - int curr_capacity = mb.input ? mb.input.capacity() : 0; - if (mb.count > curr_capacity) { - int new_capacity = std::max(mb.count, curr_capacity * 2); - auto reqs = nvcv::TensorBatch::CalcRequirements(new_capacity); - mb.input = nvcv::TensorBatch(reqs); - } else { - mb.input.clear(); - } - for (int i = mb.start; i < mb.start + mb.count; ++i) { - mb.input.pushBack(nvcvop::AsTensor(input[frame_idx(i)], sample_layout_)); - } - } - } - - void PrepareOutput(const TensorList &out) { - for (auto &mb : minibatches_) { - int curr_capacity = mb.output ? mb.output.capacity() : 0; - if (mb.count > curr_capacity) { - int new_capacity = std::max(mb.count, curr_capacity * 2); - auto reqs = nvcv::TensorBatch::CalcRequirements(new_capacity); - mb.output = nvcv::TensorBatch(reqs); - } else { - mb.output.clear(); - } - for (int i = mb.start; i < mb.start + mb.count; ++i) { - mb.output.pushBack(nvcvop::AsTensor(out[frame_idx(i)], sample_layout_)); - } - } - } - int minibatch_size_; }; diff --git a/dali/operators/nvcvop/nvcvop.cc b/dali/operators/nvcvop/nvcvop.cc index a82982eb71a..ae327cbd0a9 100644 --- a/dali/operators/nvcvop/nvcvop.cc +++ b/dali/operators/nvcvop/nvcvop.cc @@ -14,8 +14,8 @@ #include "dali/operators/nvcvop/nvcvop.h" - #include +#include namespace dali::nvcvop { @@ -208,7 +208,7 @@ nvcv::Tensor AsTensor(ConstSampleView sample, TensorLayout layout, return AsTensor(const_cast(sample.raw_data()), shape, sample.type(), layout); } -nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType daliDType, +nvcv::Tensor AsTensor(void *data, const TensorShape<> &shape, DALIDataType daliDType, TensorLayout layout) { auto dtype = GetDataType(daliDType, 1); nvcv::TensorDataStridedCuda::Buffer inBuf; @@ -225,11 +225,38 @@ nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType daliDT return nvcv::TensorWrapData(inData); } +nvcv::Tensor AsTensor(const void *data, span shape_data, const nvcv::DataType &dtype, + const nvcv::TensorLayout &layout) { + int ndim = shape_data.size(); + nvcv::TensorDataStridedCuda::Buffer inBuf; + inBuf.basePtr = reinterpret_cast(const_cast(data)); + inBuf.strides[ndim - 1] = dtype.strideBytes(); + for (int d = ndim - 2; d >= 0; --d) { + inBuf.strides[d] = shape_data[d + 1] * inBuf.strides[d + 1]; + } + nvcv::TensorShape out_shape(shape_data.data(), ndim, layout); + nvcv::TensorDataStridedCuda inData(out_shape, dtype, inBuf); + return nvcv::TensorWrapData(inData); +} + + void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList &t_list, - TensorLayout layout) { - for (int s = 0; s < t_list.num_samples(); ++s) { - batch.pushBack(AsTensor(t_list[s], layout)); + int64_t start, int64_t count, const TensorLayout &layout) { + int ndim = t_list.sample_dim(); + auto dtype = GetDataType(t_list.type(), 1); + TensorLayout out_layout = layout.empty() ? t_list.GetLayout() : layout; + DALI_ENFORCE( + out_layout.empty() || out_layout.size() == ndim, + make_string("Layout ", out_layout, " does not match the number of dimensions: ", ndim)); + auto nvcv_layout = nvcv::TensorLayout(out_layout.c_str()); + std::vector tensors; + tensors.reserve(count); + + for (int s = 0; s < count; ++s) { + tensors.push_back(AsTensor(t_list.raw_tensor(s + start), t_list.tensor_shape_span(s + start), + dtype, nvcv_layout)); } + batch.pushBack(tensors.begin(), tensors.end()); } cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements &reqs, @@ -248,4 +275,21 @@ cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements return workspace_; } +nvcv::Allocator GetScratchpadAllocator(kernels::Scratchpad &scratchpad) { + auto hostAllocator = nvcv::CustomHostMemAllocator( + [&](int64_t size, int32_t align) { return scratchpad.AllocateHost(size, align); }, + [](void *, int64_t, int32_t) {}); + + auto pinnedAllocator = nvcv::CustomHostPinnedMemAllocator( + [&](int64_t size, int32_t align) { return scratchpad.AllocatePinned(size, align); }, + [](void *, int64_t, int32_t) {}); + + auto gpuAllocator = nvcv::CustomCudaMemAllocator( + [&](int64_t size, int32_t align) { return scratchpad.AllocateGPU(size, align); }, + [](void *, int64_t, int32_t) {}); + + return nvcv::CustomAllocator(std::move(hostAllocator), std::move(pinnedAllocator), + std::move(gpuAllocator)); +} + } // namespace dali::nvcvop diff --git a/dali/operators/nvcvop/nvcvop.h b/dali/operators/nvcvop/nvcvop.h index 998fd9ba108..c4e61161c2d 100644 --- a/dali/operators/nvcvop/nvcvop.h +++ b/dali/operators/nvcvop/nvcvop.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #include "dali/pipeline/operator/sequence_operator.h" #include "dali/core/cuda_event_pool.h" + namespace dali::nvcvop { /** @@ -112,7 +114,7 @@ nvcv::Tensor AsTensor(SampleView sample, TensorLayout layout = "", nvcv::Tensor AsTensor(ConstSampleView sample, TensorLayout layout = "", const std::optional> &reshape = std::nullopt); -nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType dtype, +nvcv::Tensor AsTensor(void *data, const TensorShape<> &shape, DALIDataType dtype, TensorLayout layout); /** @@ -132,9 +134,12 @@ void AllocateImagesLike(nvcv::ImageBatchVarShape &output, const TensorList &t_list); - +/** + * @brief Push samples from a given tensor list to a given TensorBatch. + * [start, start+count) determines the range of samples in the TensorList that will be used. + */ void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList &t_list, - TensorLayout layout); + int64_t start, int64_t count, const TensorLayout &layout); class NVCVOpWorkspace { public: @@ -165,17 +170,10 @@ class NVCVOpWorkspace { int device_id_{}; }; -inline cvcuda::WorkspaceRequirements MaxWorkspaceRequirements( - const cvcuda::WorkspaceRequirements &a, const cvcuda::WorkspaceRequirements &b) { - cvcuda::WorkspaceRequirements max; - max.hostMem.size = std::max(a.hostMem.size, b.hostMem.size); - max.hostMem.alignment = std::max(a.hostMem.alignment, b.hostMem.alignment); - max.pinnedMem.size = std::max(a.pinnedMem.size, b.pinnedMem.size); - max.pinnedMem.alignment = std::max(a.pinnedMem.alignment, b.pinnedMem.alignment); - max.cudaMem.size = std::max(a.cudaMem.size, b.cudaMem.size); - max.cudaMem.alignment = std::max(a.cudaMem.alignment, b.cudaMem.alignment); - return max; -} +/** + * @brief Create an NVCV allocator using the given scratchpad. + */ +nvcv::Allocator GetScratchpadAllocator(kernels::Scratchpad &scratchpad); /** * @brief A base class for the CVCUDA operators.