Revert "enable serialize prepacked weights into data file (microsoft#…
Browse files Browse the repository at this point in the history
…22256)" (microsoft#22788)

This reverts commit c5b6be0.

### Description
Revert.

### Motivation and Context
This needs a simpler and more robust approach.
yuslepukhin authored and ankitm3k committed Dec 11, 2024
1 parent 7429f73 commit 2006a22
Showing 72 changed files with 137 additions and 872 deletions.
22 changes: 0 additions & 22 deletions include/onnxruntime/core/framework/op_kernel.h
@@ -79,7 +79,6 @@ class OpKernel {
// the allocator tied to the session if the kernel owns the pre-packed buffer or an
// allocator shared between sessions if the pre-packed buffer is to be shared across sessions
// (i.e.) the kernel does not own the buffer.
// @param save_prepacked_initializers: Set it to true if intend to save prepacked initializers to external data file.
// @param is_packed: Set it to true if the kernel packed the tensor or to false
// The kernel is responsible for keeping the packed data and related metadata if is_packed is true,
// and the original initialized constant tensor will be released and not accessible anymore in
@@ -89,7 +88,6 @@ class OpKernel {

virtual Status
PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/,
bool, /*save_prepacked_initializers*/
/*out*/ bool& is_packed, /*out*/ PrePackedWeights* /*prepacked_weights*/) {
is_packed = false;
return Status::OK();
@@ -131,26 +129,6 @@ class OpKernel {
return Status::OK();
}

// Override this function to get pre-packed tensors from this kernel.
// Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
// ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
// @param input_idx : The index of input we prepacked before and intend to get packed tensor back.
// Please refer to matmul_nbits kernel for a complete example.
virtual std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) {
return std::nullopt;
}

// Override this function to set pre-packed tensors to this kernel and restore prepacked weight buffer.
// Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
// ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
// Please refer to matmul_nbits kernel for a complete example.
// @param input_idx : The input index of the tensor in this kernel.
// @param pre_packed_tensor: The prepacked tensor read from onnx data file and use the prepacked tensor
// to restore prepacked weight buffer.
virtual Status SetPrePackTensor(int /*input_idx*/, const Tensor& /*pre_packed_tensor*/) {
return Status::OK();
}

const OrtDevice GetDevice(OrtMemType mem_type) const;
const OpKernelInfo& Info() const {
return *op_kernel_info_;
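For reference, a minimal sketch of how a CPU kernel overrides the post-revert `PrePack` signature shown above. The kernel class, the packed input index, and the copy-only packing are illustrative assumptions, not code from this commit.

```cpp
// Hypothetical kernel caching a packed copy of one weight input.
// The signature matches the reverted OpKernel::PrePack declaration; the rest is assumed.
#include <cstring>
#include "core/framework/op_kernel.h"

namespace onnxruntime {

class MyPackedKernel final : public OpKernel {
 public:
  explicit MyPackedKernel(const OpKernelInfo& info) : OpKernel(info) {}

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override {
    is_packed = false;
    if (input_idx != 1) {  // assumed: only the weight input at index 1 is packed
      return Status::OK();
    }

    const size_t bytes = tensor.SizeInBytes();
    packed_buffer_ = IAllocator::MakeUniquePtr<void>(alloc, bytes);
    std::memcpy(packed_buffer_.get(), tensor.DataRaw(), bytes);  // real kernels reorder the data here

    if (prepacked_weights != nullptr) {
      // Cross-session sharing: hand the buffer to the container; it is restored
      // later through UseSharedPrePackedBuffers().
      prepacked_weights->buffers_.push_back(std::move(packed_buffer_));
      prepacked_weights->buffer_sizes_.push_back(bytes);
    }

    is_packed = true;  // the original initializer can now be released
    return Status::OK();
  }

 private:
  IAllocatorUniquePtr<void> packed_buffer_;
};

}  // namespace onnxruntime
```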
29 changes: 2 additions & 27 deletions include/onnxruntime/core/graph/graph.h
@@ -1148,11 +1148,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
void FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_node);
#endif

// Since one constant initializer could be used by different kernels
// and prepacked differently, use an unordered_map to store prepacked
// initializer in format of <[initializer_name], <[node_name], [prepacked_initializer]>>
typedef std::unordered_map<std::string, std::unordered_map<std::string, ONNX_NAMESPACE::TensorProto>> PrePackedTensorProtoToSave;

#if !defined(ORT_MINIMAL_BUILD)
/** Gets the GraphProto representation of this Graph. */
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
@@ -1187,26 +1182,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
@param initializer_size_threshold initializers larger or equal to this threshold (in bytes) are saved
in the external file. Initializer smaller than this threshold are included in the onnx file.
@param align_info offset alignment info.
@param save_prepacked_constant_initializers whether to save prepacked initializer into external data file.
If set false to this boolean, prepacked initializer will not be saved into onnxruntime data file,
we keep constant initializer as it is.
@param pre_packed_initializers struct used to store all the prepacked initializers.
@returns GraphProto serialization of the graph.
*/
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold,
const OffsetAlignmentInfo& align_info,
bool save_prepacked_constant_initializers,
PrePackedTensorProtoToSave& pre_packed_initializers) const;
const OffsetAlignmentInfo& align_info) const;

ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold) const {
OffsetAlignmentInfo default_options;
PrePackedTensorProtoToSave pre_packed_initializers;
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options,
false, pre_packed_initializers);
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
}

/** Gets the ISchemaRegistry instances being used with this Graph. */
@@ -1521,18 +1508,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
private:
void InitializeStateFromModelFileGraphProto();

// Private method used to setup external initializer properly during model save,
// this external initializer could be oroginal initializer or prepacked initializer.
static void SetUpExternalInitializer(const Graph::OffsetAlignmentInfo& align_info,
size_t tensor_bytes_size,
int64_t& external_offset,
std::ofstream& external_stream,
gsl::span<const uint8_t> raw_data,
ONNX_NAMESPACE::TensorProto& output_proto,
const std::filesystem::path& external_file_path,
const ONNX_NAMESPACE::TensorProto& initializer,
bool is_prepacked);

// Add node with specified <node_proto>.
Node& AddNode(const ONNX_NAMESPACE::NodeProto& node_proto,
const ArgNameToTypeMap& name_to_type);
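As context for the signatures above, a brief sketch of calling the two `ToGraphProtoWithExternalInitializers` overloads that remain after the revert; the graph reference, file names, and threshold are assumed for illustration.

```cpp
// Illustrative only: serialize a Graph, moving large initializers into an external data file.
#include <filesystem>
#include "core/graph/graph.h"

ONNX_NAMESPACE::GraphProto SaveWithExternalData(const onnxruntime::Graph& graph) {
  const std::filesystem::path external_file_path = "model.onnx.data";  // assumed name
  const std::filesystem::path model_file_path = "model.onnx";          // assumed name
  constexpr size_t initializer_size_threshold = 1024;  // initializers >= this many bytes go external

  // Convenience overload: default offset alignment.
  ONNX_NAMESPACE::GraphProto proto = graph.ToGraphProtoWithExternalInitializers(
      external_file_path, model_file_path, initializer_size_threshold);

  // Full overload: explicit offset alignment options.
  onnxruntime::Graph::OffsetAlignmentInfo align_info;  // defaults assumed acceptable
  proto = graph.ToGraphProtoWithExternalInitializers(
      external_file_path, model_file_path, initializer_size_threshold, align_info);

  return proto;
}
```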
@@ -246,12 +246,6 @@ static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disab
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
"session.optimized_model_external_initializers_file_name";

// Use this config when save prepacked constant initializers to onnx external data file.
// Default is not save prepacked initializers to onnx data file.
// Sample usage: sess_options.add_session_config_entry('session.save_prepacked_constant_initializers', "1")
static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
"session.save_prepacked_constant_initializers";

// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";
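With the prepacked-initializer key above removed by this revert, here is a hedged sketch of setting the external-initializer config entries that remain, via the public C++ API; the paths and threshold value are illustrative.

```cpp
// Illustrative only: configure external-initializer serialization for the optimized model.
#include "onnxruntime_cxx_api.h"

void ConfigureExternalInitializers(Ort::SessionOptions& session_options) {
  // Save the optimized model to disk (assumed path).
  session_options.SetOptimizedModelFilePath(ORT_TSTR("optimized_model.onnx"));
  // Name of the external data file for large initializers (assumed name).
  session_options.AddConfigEntry("session.optimized_model_external_initializers_file_name",
                                 "optimized_model.onnx.data");
  // Initializers at or above this size in bytes are written to the external file.
  session_options.AddConfigEntry("session.optimized_model_external_initializers_min_size_in_bytes",
                                 "1024");
}
```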
2 changes: 0 additions & 2 deletions onnxruntime/contrib_ops/cpu/bert/attention.cc
@@ -30,7 +30,6 @@ class Attention : public OpKernel, public AttentionCPUBase {
Status Compute(OpKernelContext* context) const override;

Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;

@@ -102,7 +101,6 @@ bool Attention<T>::IsPackWeightsSuccessful(int qkv_index,

template <typename T>
Status Attention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
/* The PrePack() massages the weights to speed up Compute(), there is an option to
2 changes: 0 additions & 2 deletions onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
@@ -24,7 +24,6 @@ class QAttention : public OpKernel, public AttentionCPUBase {
Status Compute(OpKernelContext* context) const override;

Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool save_prepacked_initializers,
bool& /*out*/ is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;

@@ -59,7 +58,6 @@ QAttention<T>::QAttention(const OpKernelInfo& info) : OpKernel(info), AttentionC

template <typename T>
Status QAttention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
if (1 != input_idx) {
@@ -13,7 +13,7 @@ class DynamicQuantizeLSTM : public OpKernel, public LSTMBase {
DynamicQuantizeLSTM(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}

Status PrePack(const Tensor& tensor, int input_idx,
AllocatorPtr alloc, bool save_prepacked_initializers, /*out*/ bool& is_packed,
AllocatorPtr alloc, /*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;

Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
@@ -91,7 +91,6 @@ static void UseSharedPrePackedBuffersImpl(std::vector<BufferUniquePtr>& prepacke
}

Status DynamicQuantizeLSTM::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
is_packed = false;
56 changes: 0 additions & 56 deletions onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -98,19 +98,12 @@ class MatMulNBits final : public OpKernel {
Status Compute(OpKernelContext* context) const override;

Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;

void ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx);

Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
/*out*/ bool& used_shared_buffers) override;

std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) override;

Status SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) override;

private:
const size_t K_;
const size_t N_;
@@ -126,8 +119,6 @@ class MatMulNBits final : public OpKernel {
size_t packed_b_size_{0};
IAllocatorUniquePtr<float> scales_fp32_{};
IAllocatorUniquePtr<float> bias_fp32_{};
std::optional<Tensor> packed_tensor_{std::nullopt};
MLDataType prepack_tensor_data_type_;

bool has_zp_input_{false};

@@ -157,22 +148,8 @@ class MatMulNBits final : public OpKernel {
}
};

template <typename T1>
void MatMulNBits<T1>::ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx) {
if (input_idx == InputIndex::B) {
prepack_tensor_data_type_ = tensor.DataType();
}

TensorShapeVector weights_dims = {static_cast<int64_t>((packed_b_size_ - 1) / prepack_tensor_data_type_->Size()) + 1};
packed_tensor_ = Tensor(prepack_tensor_data_type_,
TensorShape(weights_dims),
packed_b_.get(),
OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator));
}

template <typename T1>
Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -208,16 +185,11 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
#endif // MLAS_TARGET_AMD64_IX86
}

if (save_prepacked_initializers) {
ConvertPrepackWeightIntoTensor(tensor, input_idx);
}

return Status::OK();
}

template <>
Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -267,34 +239,6 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
#endif // MLAS_TARGET_AMD64_IX86
}

if (save_prepacked_initializers) {
ConvertPrepackWeightIntoTensor(tensor, input_idx);
}

return Status::OK();
}

template <typename T1>
std::optional<Tensor> MatMulNBits<T1>::GetPrePackTensor(int input_idx) {
// For this kernel, prepack is performed on input_B, and possibly scales, zeros_points.
// During compute process, scales and zeros_points will keep as it is and only use prepacked
// buffer to replace input_B.
// Inorder to cope with this logic, we need to return latest prepacked buffer and only serialize
// the latest one. So, we need to always return packed_tensor_ here not only for input_B.
ORT_UNUSED_PARAMETER(input_idx);
return std::move(packed_tensor_);
}

template <typename T1>
Status MatMulNBits<T1>::SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) {
if (input_idx == 1) {
// pre_packed_tensor is constant initialized tensor and its lifecycle is managed by session_state,
// session_state will release memory from pre_packed_tensor. packed_b_ will not release memory so
// pass empty/default buffer deleter here.
// const_cast here is temporary, will fix in follow up PR.
packed_b_ = BufferUniquePtr(const_cast<void*>(pre_packed_tensor.DataRaw()), BufferDeleter());
}

return Status::OK();
}

1 change: 0 additions & 1 deletion onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -278,7 +278,6 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {

template <typename T, bool simplified>
Status SkipLayerNorm<T, simplified>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool /*save_prepacked_initializers*/,
bool& is_packed, PrePackedWeights* prepacked_weights) {
ORT_UNUSED_PARAMETER(prepacked_weights);

2 changes: 1 addition & 1 deletion onnxruntime/contrib_ops/cpu/skip_layer_norm.h
@@ -16,7 +16,7 @@ class SkipLayerNorm final : public OpKernel {
SkipLayerNorm(const OpKernelInfo& op_kernel_info);
Status Compute(OpKernelContext* p_op_kernel_context) const override;

Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool save_prepacked_initializers,
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool& is_packed, PrePackedWeights* prepacked_weights) override;

private:
1 change: 0 additions & 1 deletion onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
@@ -95,7 +95,6 @@ GroupNorm::GroupNorm(const OpKernelInfo& op_info) : CudaKernel(op_info) {
}

Status GroupNorm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr /*alloc*/,
bool /*save_prepacked_initializers*/,
bool& is_packed, PrePackedWeights* /*prepacked_weights*/) {
is_packed = false;

1 change: 0 additions & 1 deletion onnxruntime/contrib_ops/cuda/diffusion/group_norm.h
@@ -17,7 +17,6 @@ class GroupNorm final : public CudaKernel {
Status ComputeInternal(OpKernelContext* context) const override;

Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool save_prepacked_initializers,
bool& is_packed, PrePackedWeights* prepacked_weights) override;

private:
@@ -99,7 +99,6 @@ Status QOrderedAttention::PutIntoMergedBias(const Tensor& tensor, AllocatorPtr a
}

Status QOrderedAttention::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* /*prepacked_weights*/) {
is_packed = false;
@@ -20,7 +20,6 @@ class QOrderedAttention final : public CudaKernel, public AttentionBase {

public:
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;

@@ -51,7 +51,6 @@ QOrderedMatMul::QOrderedMatMul(const OpKernelInfo& info) : CudaKernel(info) {
}

Status QOrderedMatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* /* prepacked_weights */) {
is_packed = false;
@@ -18,7 +18,6 @@ class QOrderedMatMul final : public CudaKernel {
Status ComputeInternal(OpKernelContext* context) const override;

Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;

6 changes: 0 additions & 6 deletions onnxruntime/core/framework/session_options.h
@@ -83,11 +83,6 @@ struct SessionOptions {
// enable profiling for this session.
bool enable_profiling = false;

// save pre-packed constant external initializers instead of original initializers to onnxruntime data file.
// Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
// ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
bool save_prepacked_constant_initializers = false;

// Non empty filepath enables serialization of the transformed optimized model to the specified filepath.
//
// Set session config value for ORT_SESSION_OPTIONS_CONFIG_SAVE_MODEL_FORMAT to 'ORT' or 'ONNX' to explicitly
@@ -196,7 +191,6 @@ inline std::ostream& operator<<(std::ostream& os, const SessionOptions& session_
<< " execution_mode:" << session_options.execution_mode
<< " execution_order:" << session_options.execution_order
<< " enable_profiling:" << session_options.enable_profiling
<< " save_prepacked_constant_initializers:" << session_options.save_prepacked_constant_initializers
<< " optimized_model_filepath:" << ORT_TSTR_CONVERT_TO_PRINTABLE_STRING(session_options.optimized_model_filepath)
<< " enable_mem_pattern:" << session_options.enable_mem_pattern
<< " enable_mem_reuse:" << session_options.enable_mem_reuse
