Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement pre-packed blobs serialization on disk and their memory mapping on load #23069

Merged
merged 10 commits into from
Dec 20, 2024
4 changes: 3 additions & 1 deletion include/onnxruntime/core/framework/buffer_deleter.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "core/framework/allocator.h"

#include <functional>

Check warning on line 8 in include/onnxruntime/core/framework/buffer_deleter.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Found C++ system header after other header. Should be: buffer_deleter.h, c system, c++ system, other. [build/include_order] [4] Raw Output: include/onnxruntime/core/framework/buffer_deleter.h:8: Found C++ system header after other header. Should be: buffer_deleter.h, c system, c++ system, other. [build/include_order] [4]

namespace onnxruntime {

// TODO: Do we need this class or is IAllocator::MakeUniquePtr sufficient/better
Expand All @@ -31,6 +33,6 @@
AllocatorPtr alloc_{nullptr};
};

using BufferUniquePtr = std::unique_ptr<void, BufferDeleter>;
using BufferUniquePtr = std::unique_ptr<void, std::function<void(void*)>>;

Check warning on line 36 in include/onnxruntime/core/framework/buffer_deleter.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <memory> for unique_ptr<> [build/include_what_you_use] [4] Raw Output: include/onnxruntime/core/framework/buffer_deleter.h:36: Add #include <memory> for unique_ptr<> [build/include_what_you_use] [4]
using BufferNakedPtr = void*;
} // namespace onnxruntime
58 changes: 25 additions & 33 deletions include/onnxruntime/core/graph/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
#include "core/common/common.h"
#include "core/common/path_string.h"
#include "core/common/const_pointer_container.h"
#include "core/common/inlined_containers_fwd.h"
#if !defined(ORT_MINIMAL_BUILD)
#include "core/common/inlined_containers.h"
#endif
#include "core/common/inlined_containers_fwd.h"
#include "core/common/span_utils.h"
#include "core/common/status.h"
#include "core/common/logging/logging.h"
Expand All @@ -41,6 +41,7 @@ namespace onnxruntime {
class Graph;
struct IndexedSubGraph;
class Model;
struct ModelSavingOptions;
class OpSignature;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
Expand Down Expand Up @@ -1153,29 +1154,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
ONNX_NAMESPACE::GraphProto ToGraphProto() const;

// Options to align external initializer offset.
// For models running on CPU, ORT will try to use mmap to load external initializers.
// To use mmap, external initializer need to be offset aligned.
// ORT saves external initializers into a single data file, each initializer is accessed with
// offset(start position of initializer) and length(byte length of initializer) of the data file.
// To use mmap, each offset needs to be aligned, which means the offset needs to be divisible by
// allocation granularity(64KB for windows and 4K for other OSes).
// With align_offset to true, ORT will align offset for large initializer when
// save ONNX model with external data file.
// Settings controlling whether and how external initializer offsets are aligned on save.
struct OffsetAlignmentInfo {
// When true, offsets are page aligned and allocation granularity aligned for mmap support.
// This is done by padding previous tensor data with zeros keeping same length.
// Defaults to false (no alignment).
bool align_offset = false;
// Alignment threshold for size of data.
// Having a low threshold will waste file space for small initializers.
// Only when tensor's data size is > align_threshold it will be force aligned.
// Default to 1MB.
int64_t align_threshold = 1048576;
// The allocation granularity for mmap() support.
// Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
int64_t allocation_granularity = 65536;
};

/** Gets the GraphProto representation of this Graph
@param external_file_path File path of the binary file to use for initializers.
@param model_file_path path of the model file.
Expand All @@ -1186,15 +1164,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
*/
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold,
const OffsetAlignmentInfo& align_info) const;

ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold) const {
OffsetAlignmentInfo default_options;
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
}
const ModelSavingOptions& model_saving_options) const;

/** Gets the ISchemaRegistry instances being used with this Graph. */
IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
Expand Down Expand Up @@ -1519,6 +1489,28 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
std::optional<std::string_view> new_name);

/// <summary>
/// A map that is used to keep track of pre-packed blobs to be serialized
/// The implementation adds pre-packed external data references to the TensorProto
/// that contains the initializer data. However, it may be an outerscope initializer.
/// Thus we need to keep track of the pre-packed blobs that are not serialized in this
/// graph, so the parent can make sure it is being serialized.
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
///
/// The below map has <weight_name, std::vector<blob_key_name>>. This contains
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
/// the entries that are not serialized in this graph, and the parent must check in them
/// </summary>
using WeightToPrePacksMap = NodeHashMap<std::string, InlinedHashSet<std::string>>;

Status ToGraphProtoWithExternalInitiallizersImpl(
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
const std::filesystem::path& model_path,
const std::filesystem::path& external_file_path,
const std::filesystem::path& modified_external_file_path,
const ModelSavingOptions& model_saving_options,
WeightToPrePacksMap& unprocessed_prepacks,
ONNX_NAMESPACE::GraphProto& graph_proto,
std::ostream& external_stream,
int64_t& external_offset) const;

#endif

Version IrVersion() const noexcept {
Expand Down
44 changes: 44 additions & 0 deletions include/onnxruntime/core/graph/model_saving_options.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace onnxruntime {

class PrepackedForSerialization;

// These options affect how the model initializers are saved.
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
// This includes options to align external initializer offset.
// For models running on CPU, ORT will try to use mmap to load external
// initializers. To use mmap, external initializer need to be offset aligned.
// ORT saves external initializers into a single data file, each initializer is
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
// accessed with offset(start position of initializer) and length(byte length of
// initializer) of the data file. To use mmap, each offset need to be aligned
// which means offset needs to be divisible by allocation granularity(64KB for
// windows and 4K for other OSes). With align_offset to true, ORT will align
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
// offset for large initializer when save ONNX model with external data file.
struct ModelSavingOptions {
// Creates options with the given externalization size threshold.
// All other members keep their in-class default values.
explicit ModelSavingOptions(size_t size_threshold)
: initializer_size_threshold(size_threshold) {}

// Minimal initializer size in bytes to be externalized on disk
size_t initializer_size_threshold;
// When true, offsets are page aligned and allocation granularity aligned for
// mmap support. This is done by padding previous tensor data with zeros
// keeping same length. Defaults to false (no alignment).
bool align_offset = false;
// Alignment threshold for size of data.
// Having a low threshold will waste file space for small initializers.
// Only when tensor's data size is > align_threshold it will be force
// aligned. Default to 1MB.
int64_t align_threshold = 1048576;
// The allocation granularity for mmap() support.
// Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
int64_t allocation_granularity = 65536;
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
// Optional pointer to a container of pre-packed initializers to be
// embedded into the external initializers, so they can also be loaded
// from disk. Defaults to nullptr (no pre-packed container supplied).
const PrepackedForSerialization* prepacked_for_save = nullptr;
};

} // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,15 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

// Use this config to save pre-packed constant initializers to an external data file.
// This allows to minimize ONNX model file size and memory map pre-packed initializers on
// model load.
// - "0": Default. Do not save pre-packed initializers to a data file.
// - "1": Save pre-packed constant initializers to an external data file.
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1")
static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
"session.save_external_prepacked_constant_initializers";

// Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
// The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
// "0": disable. (default)
Expand Down
8 changes: 6 additions & 2 deletions onnxruntime/core/framework/prepacked_weights.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <vector>

#include "core/common/basic_types.h"
#include "core/common/inlined_containers_fwd.h"
#include "core/framework/buffer_deleter.h"
#include "core/framework/tensor_shape.h"

Expand All @@ -16,11 +17,14 @@ struct PrePackedWeights final {
// Hence we hold them in container. It is upto the developer implementing each PrePack()
// method to define what gets stored in which position of the container.

std::vector<IAllocatorUniquePtr<void>> buffers_; // cache pre-packed buffers associated with the kernel
std::vector<size_t> buffer_sizes_; // cache sizes of pre-packed buffers (in bytes)
InlinedVector<BufferUniquePtr> buffers_; // cache pre-packed buffers associated with the kernel
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
InlinedVector<size_t> buffer_sizes_; // cache sizes of pre-packed buffers (in bytes)

// Produces a hash of the buffers stored in the given instance of this class
HashValue GetHash() const;

// The function creates a copy with non-owning BufferUniquePtrs.
PrePackedWeights CreateReferringCopy() const;
};

} // namespace onnxruntime
95 changes: 95 additions & 0 deletions onnxruntime/core/framework/prepacked_weights_container.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,21 @@

#include "core/framework/prepacked_weights_container.h"
#include "core/framework/allocator_utils.h"
#include "core/graph/graph.h"

namespace onnxruntime {

// Produces a PrePackedWeights instance whose buffer pointers refer to the same
// memory as the buffers held by this instance, along with a copy of the sizes.
PrePackedWeights PrePackedWeights::CreateReferringCopy() const {
  PrePackedWeights referring;
  referring.buffer_sizes_ = buffer_sizes_;

  for (const auto& source_buffer : buffers_) {
    // The deleter is constructed from nullptr because the referring copy
    // does not own the underlying data.
    referring.buffers_.emplace_back(source_buffer.get(), BufferDeleter(nullptr));
  }

  return referring;
}

AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) {
auto iter = allocators_.find(device_name);

Expand Down Expand Up @@ -49,4 +61,87 @@
return prepacked_weights_map_.size();
}

// Initializes main_graph_ with nullptr, this container's key_to_blobs_ and false.
// NOTE(review): the three arguments presumably mean (parent subgraph, shared key-to-blob
// map, save mode flag) - confirm against the Subgraph constructor declared in the header.
PrepackedForSerialization::PrepackedForSerialization()
: main_graph_(nullptr, key_to_blobs_, false) {
}

// Out-of-line defaulted destructor.
PrepackedForSerialization::~PrepackedForSerialization() = default;

// Stores `packed_weight` in key_to_blobs_ under `key`, moving from the argument.
// Unlike WritePacked(), this does not touch sorted_by_weight_for_writing_.
void PrepackedForSerialization::Subgraph::InsertFromDisk(const std::string& key, PrePackedWeights&& packed_weight) {
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
// We may have duplicate entries mapped from disk if the same weight is pre-packed from subgraphs and
// up the tree by the same kernel with the same result. The map prevents this from happening.
// NOTE(review): this relies on emplace() leaving an existing entry for `key` untouched,
// which holds for standard associative containers - confirm the type of key_to_blobs_.
key_to_blobs_.emplace(key, std::move(packed_weight));
}

// Stores `packed_weight` under `key`, replacing any blob already stored for that key.
// When save mode is on, `key` is also recorded in the set of keys associated with
// `weight_name` so that it can be written out later.
void PrepackedForSerialization::Subgraph::WritePacked(const std::string& weight_name, const std::string& key,
                                                      PrePackedWeights&& packed_weight) {
  // The key may or may not already be referenced by this subgraph; the set
  // insertion is harmless either way.
  if (save_mode_on_) {
    sorted_by_weight_for_writing_[weight_name].insert(key);
  }

  auto existing = key_to_blobs_.find(key);
  if (existing != key_to_blobs_.end()) {
    // Key existed: overwrite the stored blob.
    existing->second = std::move(packed_weight);
  } else {
    // New key.
    key_to_blobs_.emplace(key, std::move(packed_weight));
  }
}

// Looks up the pre-packed blob stored under `key`.
// Returns a pointer to the stored entry, or nullptr when the key is not present.
const PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) const {
  const auto found = key_to_blobs_.find(key);
  return (found != key_to_blobs_.end()) ? &found->second : nullptr;
}

// Hands the blob stored under `key` over to the caller, if there is one.
//
// - Key absent: returns std::nullopt. In save mode, a non-owning referring copy of
//   `refer_if_absent` is stored under `key` and `key` is recorded for `weight_name`.
// - Key present: the stored blob is moved out and returned. In save mode the stored
//   entry is replaced with a referring copy of the returned blob and `key` is recorded
//   for `weight_name`; otherwise the entry is erased.
std::optional<PrePackedWeights> PrepackedForSerialization::Subgraph::ReplaceWithReferenceIfSaving(
const std::string& weight_name,
const std::string& key,

Check warning on line 106 in onnxruntime/core/framework/prepacked_weights_container.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <string> for string [build/include_what_you_use] [4] Raw Output: onnxruntime/core/framework/prepacked_weights_container.cc:106: Add #include <string> for string [build/include_what_you_use] [4]
const PrePackedWeights& refer_if_absent) {
yuslepukhin marked this conversation as resolved.
Show resolved Hide resolved
auto it = key_to_blobs_.find(key);
if (it == key_to_blobs_.end()) {
if (save_mode_on_) {
// Nothing stored yet: keep a reference to the caller's blob.
key_to_blobs_.emplace(key, refer_if_absent.CreateReferringCopy());
sorted_by_weight_for_writing_[weight_name].insert(key);
}
return std::nullopt;
}

// Move the stored blob out; it->second is re-assigned or erased below.
PrePackedWeights result = std::move(it->second);

Check warning on line 117 in onnxruntime/core/framework/prepacked_weights_container.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <utility> for move [build/include_what_you_use] [4] Raw Output: onnxruntime/core/framework/prepacked_weights_container.cc:117: Add #include <utility> for move [build/include_what_you_use] [4]
if (save_mode_on_) {
// Keep a referring copy in the map so the key stays resolvable while `result` is returned.
it->second = result.CreateReferringCopy();
auto& list = sorted_by_weight_for_writing_[weight_name];
list.insert(key);
} else {
// Not saving: the entry is no longer needed once the blob has been handed out.
key_to_blobs_.erase(it);
}
return result;
}

// Returns the Subgraph entry that corresponds to `graph`.
// A graph without a parent maps to main_graph_; otherwise the parent's entry is
// resolved first (recursively) and asked to get or create the entry for `graph`.
PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreatePrepackedGraph(const Graph& graph) {
  const auto* parent_graph = graph.ParentGraph();
  if (parent_graph != nullptr) {
    auto& parent_entry = FindOrCreatePrepackedGraph(*parent_graph);
    return parent_entry.GetOrCreateSubgraph(graph);
  }
  return main_graph_;
}

// Finds the Subgraph entry that corresponds to `graph` without creating anything.
// A graph without a parent maps to main_graph_. Otherwise the parent's entry is
// resolved first (recursively); nullptr is returned when the parent entry is missing,
// else whatever the parent's GetSubgraph() returns for `graph`.
const PrepackedForSerialization::Subgraph* PrepackedForSerialization::FindPrepackedGraph(const Graph& graph) const {
  const auto* parent_graph = graph.ParentGraph();
  if (parent_graph == nullptr) {
    return &main_graph_;
  }

  const auto* parent_entry = FindPrepackedGraph(*parent_graph);
  if (parent_entry == nullptr) {
    return nullptr;
  }
  return parent_entry->GetSubgraph(graph);
}

} // namespace onnxruntime
Loading
Loading