From f5a05b52191e19d67225f332c19dfd53dac55843 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 8 Feb 2023 20:14:28 +0800 Subject: [PATCH] [Refactor] Support batch inference with shape clustering (#1668) * refactor `NetModule` * name * fix sorting * fix indices --- csrc/mmdeploy/net/net_module.cpp | 185 +++++++++++++++++++++++-------- 1 file changed, 139 insertions(+), 46 deletions(-) diff --git a/csrc/mmdeploy/net/net_module.cpp b/csrc/mmdeploy/net/net_module.cpp index 218b8e98ff..d9ded2b5b0 100644 --- a/csrc/mmdeploy/net/net_module.cpp +++ b/csrc/mmdeploy/net/net_module.cpp @@ -1,7 +1,9 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "net_module.h" +#include "mmdeploy/net/net_module.h" +#include +#include #include #include "mmdeploy/archive/value_archive.h" @@ -31,6 +33,11 @@ struct NetModule::Impl { is_profiling_ = true; } auto model = context["model"].get(); + for (const auto& meta : model.meta().models) { + if (meta.name == name) { + max_batch_size_ = meta.batch_size; + } + } OUTCOME_TRY(auto config, model.GetModelConfig(name)); device_ = context.value("device", Device{"cpu"}); stream_ = context.value("stream", Stream::GetDefault(device_)); @@ -78,7 +85,7 @@ struct NetModule::Impl { return success(); } - Result InferInputShape(const vector& input) { + static Result InferBatchShape(const vector& input) { auto batch_size = input.size(); auto& exemplar = input.front(); auto shape = exemplar.shape(); @@ -86,13 +93,13 @@ struct NetModule::Impl { return shape; } if (shape[0] != 1) { - MMDEPLOY_ERROR("unsupported shape for batch assemble: {}", shape); + MMDEPLOY_WARN("unsupported shape for batch assemble: {}", shape); return Status(eNotSupported); } for (int i = 1; i < input.size(); ++i) { auto& sample = input[i]; if (sample.shape() != shape) { - MMDEPLOY_ERROR("shapes are not consistent across the batch"); + MMDEPLOY_WARN("shapes are not consistent across the batch"); return Status(eNotSupported); } } @@ -100,90 +107,175 @@ struct NetModule::Impl { return shape; } - Result > InferInputShape(const vector >& inputs) { + static Result> InferBatchShape(const vector>& inputs) { vector shapes; shapes.reserve(inputs.size()); for (const auto& input : inputs) { - OUTCOME_TRY(auto shape, InferInputShape(input)); + OUTCOME_TRY(auto shape, InferBatchShape(input)); shapes.push_back(std::move(shape)); } return shapes; } - Result > Forward(const std::vector& input) { - // auto t0 = std::chrono::high_resolution_clock::now(); - // - auto batch_size = static_cast(input.size()); - - std::vector > input_samples; + Result>> CollectInputTensors(const vector& inputs) { + vector> input_samples; input_samples.reserve(inputs_.size()); for (const auto& t : inputs_) { auto name = input_mapping_.at(t.name()); - std::vector tmp; - tmp.reserve(input.size()); - for (int i = 0; i < input.size(); ++i) { - auto& sample = input[i]; + auto& tmp = input_samples.emplace_back(); + for (const auto& sample : inputs) { if (auto it = sample.find(name); it != sample.end()) { tmp.push_back(it->second); } else { - MMDEPLOY_ERROR("sample {} missing key {}", i, name); + MMDEPLOY_ERROR("sample {} missing key {}", &sample - inputs.data(), name); return Status(eInvalidArgument); } } - input_samples.push_back(std::move(tmp)); + } + return input_samples; + } + + void SaveBatch(vector> samples, vector indices, + vector>>& batch_tensors, + vector>& batch_shapes, + vector>& batch_sample_idxs) const { + if (auto maybe_batch_shape = InferBatchShape(samples)) { + batch_shapes.push_back(maybe_batch_shape.value()); + batch_tensors.push_back(std::move(samples)); + batch_sample_idxs.push_back(std::move(indices)); + } else { + // cannot assemble batch, do it one by one + for (int k = 0; k < indices.size(); ++k) { + auto& shapes = batch_shapes.emplace_back(); + auto& batch = batch_tensors.emplace_back(inputs_.size()); + batch_sample_idxs.push_back({indices[k]}); + for (int j = 0; j < inputs_.size(); ++j) { + shapes.push_back(samples[j][k].shape()); + batch[j].push_back(std::move(samples[j][k])); + } + } + } + } + + void SamplesToBatches(const vector>& input_samples, size_t n_samples, + vector>>& batch_tensors, + vector>& batch_shapes, + vector>& batch_sample_idxs) const { + // concat all shapes in samples to make comparison easier + vector> concat_shapes; + concat_shapes.reserve(n_samples); + for (size_t i = 0; i < n_samples; ++i) { + auto& shape = concat_shapes.emplace_back(); + for (const auto& input : input_samples) { + shape.insert(shape.end(), input[i].shape().begin(), input[i].shape().end()); + } + } + + // cluster samples by concatenated shapes + vector shape_idxs(concat_shapes.size()); + std::iota(shape_idxs.begin(), shape_idxs.end(), 0); + std::sort(shape_idxs.begin(), shape_idxs.end(), + [&concat_shapes](int i, int j) { return concat_shapes[i] < concat_shapes[j]; }); + shape_idxs.erase(std::unique(shape_idxs.begin(), shape_idxs.end(), + [&concat_shapes](int i, int j) { + return concat_shapes[i] == concat_shapes[j]; + }), + shape_idxs.end()); + + // generate batches of samples with equal shapes, limit the batch size by max_batch_size_ + for (const auto ref_shape_idx : shape_idxs) { + const auto& ref_shape = concat_shapes[ref_shape_idx]; + vector> samples(inputs_.size()); + vector indices; + for (size_t i = 0; i < concat_shapes.size(); ++i) { + if (concat_shapes[i] == ref_shape) { + for (size_t j = 0; j < inputs_.size(); ++j) { + samples[j].push_back(input_samples[j][i]); + } + indices.push_back(static_cast(i)); + if (indices.size() == max_batch_size_) { + SaveBatch(std::move(samples), std::move(indices), batch_tensors, batch_shapes, + batch_sample_idxs); + samples = vector>(inputs_.size()); + indices = {}; + } + } + } + if (!indices.empty()) { + SaveBatch(std::move(samples), std::move(indices), batch_tensors, batch_shapes, + batch_sample_idxs); + } + } + } + + Result> Forward(const vector& inputs) { + OUTCOME_TRY(auto input_samples, CollectInputTensors(inputs)); + + vector>> batch_tensors; + vector> batch_shapes; + vector> batch_sample_indices; + + SamplesToBatches(input_samples, inputs.size(), batch_tensors, batch_shapes, + batch_sample_indices); + + vector outputs(inputs.size()); + for (size_t i = 0; i < batch_tensors.size(); ++i) { + OUTCOME_TRY(net_->Reshape(batch_shapes[i])); + OUTCOME_TRY(CopyInputTensors(batch_tensors[i], batch_shapes[i])); + OUTCOME_TRY(net_->Forward()); + OUTCOME_TRY(CopyOutputTensors(batch_sample_indices[i], outputs)); + if (i + 1 < batch_tensors.size()) { // sync if not the last batch + OUTCOME_TRY(stream_.Wait()); + } } - // 1. calculate input shape - OUTCOME_TRY(auto input_shapes, InferInputShape(input_samples)); + if (is_profiling_) { + OUTCOME_TRY(stream_.Wait()); + } - // 2. call backend's reshape - OUTCOME_TRY(net_->Reshape(input_shapes)); + return outputs; + } - // 3. fill input tensor + Result CopyInputTensors(const vector>& batch, + const vector& shapes) const { for (int i = 0; i < inputs_.size(); ++i) { - auto& src = input_samples[i]; + auto& src = batch[i]; auto& dst = inputs_[i]; - if (dst.shape() != input_shapes[i]) { - MMDEPLOY_ERROR("inconsistent input shape, expect {}, got {}", input_shapes[i], dst.shape()); + if (dst.shape() != shapes[i]) { + MMDEPLOY_ERROR("inconsistent input shape, expect {}, got {}", shapes[i], dst.shape()); return Status(eFail); } if (src.size() > 1) { for (int j = 0; j < src.size(); ++j) { - auto slice = dst.Slice(j); - OUTCOME_TRY(src[j].CopyTo(slice, stream_)); + OUTCOME_TRY(dst.Slice(j).CopyFrom(src[j], stream_)); } } else { - OUTCOME_TRY(src[0].CopyTo(dst, stream_)); + OUTCOME_TRY(src.front().CopyTo(dst, stream_)); } } + return success(); + } - // 5. forward - OUTCOME_TRY(net_->Forward()); - - vector output(batch_size); - for (const auto& t : outputs_) { - auto name = output_mapping_.at(t.name()); - auto desc = t.desc(); + Result CopyOutputTensors(const vector& indices, vector& outputs) { + for (const auto& output : outputs_) { + auto name = output_mapping_.at(output.name()); + auto desc = output.desc(); desc.device = device_; Tensor tmp(desc); if (tmp.size()) { - OUTCOME_TRY(t.CopyTo(tmp, stream_)); + OUTCOME_TRY(output.CopyTo(tmp, stream_)); } else { MMDEPLOY_WARN("copy skipped due to zero sized tensor"); } - if (output.size() > 1) { - for (int i = 0; i < output.size(); ++i) { - output[i].emplace(name, tmp.Slice(i)); + if (indices.size() > 1) { + for (int i = 0; i < indices.size(); ++i) { + outputs[indices[i]].emplace(name, tmp.Slice(i)); } } else { - output[0].emplace(name, std::move(tmp)); + outputs[indices.front()].emplace(name, std::move(tmp)); } } - if (is_profiling_) { - OUTCOME_TRY(stream_.Wait()); - } - - return output; + return success(); } Device device_; @@ -195,6 +287,7 @@ struct NetModule::Impl { std::map input_mapping_; // outer scope to model output names std::map output_mapping_; + int max_batch_size_{1}; bool is_profiling_{false}; };