Clarify the header JSON structure and note it may be padded

huggingface · Aug 1, 2023 · e3e11f7 · e3e11f7
1 parent 3178f6f
commit e3e11f7
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -76,8 +76,10 @@ with safe_open("model.safetensors", framework="pt", device="cpu") as f:
 ### Format
 
 - 8 bytes: `N`, an unsigned little-endian 64-bit integer, containing the size of the header
-- N bytes: a JSON utf-8 string representing the header.
-  - The header is a dict like `{"TENSOR_NAME": {"dtype": "F16", "shape": [1, 16, 256], "data_offsets": [BEGIN, END]}, "NEXT_TENSOR_NAME": {...}, ...}`, where offsets point to the tensor data relative to the beginning of the byte buffer, with `BEGIN` as the starting offset and `END` as the one-past offset (so total tensor byte size = `END - BEGIN`).
+- N bytes: a JSON UTF-8 string representing the header. It may be padded on either side with JSON whitespace (any of the bytes 0x09, 0x0A, 0x0D, 0x20); the padding counts toward `N`.
+  - The header is a dict like `{"TENSOR_NAME": {"dtype": "F16", "shape": [1, 16, 256], "data_offsets": [BEGIN, END]}, "NEXT_TENSOR_NAME": {...}, ...}`,
+    - `data_offsets` point to the tensor data relative to the beginning of the byte buffer (i.e. not an absolute position in the file),
+      with `BEGIN` as the starting offset and `END` as the one-past offset (so total tensor byte size = `END - BEGIN`).
   - A special key `__metadata__` is allowed to contain free form string-to-string map. Arbitrary JSON is not allowed, all values must be strings.
 - Rest of the file: byte-buffer.
 

diff --git a/safetensors/README.md b/safetensors/README.md
@@ -76,8 +76,10 @@ with safe_open("model.safetensors", framework="pt", device="cpu") as f:
 ### Format
 
 - 8 bytes: `N`, an unsigned little-endian 64-bit integer, containing the size of the header
-- N bytes: a JSON utf-8 string representing the header.
-  - The header is a dict like `{"TENSOR_NAME": {"dtype": "F16", "shape": [1, 16, 256], "data_offsets": [BEGIN, END]}, "NEXT_TENSOR_NAME": {...}, ...}`, where offsets point to the tensor data relative to the beginning of the byte buffer, with `BEGIN` as the starting offset and `END` as the one-past offset (so total tensor byte size = `END - BEGIN`).
+- N bytes: a JSON UTF-8 string representing the header. It may be padded on either side with JSON whitespace (any of the bytes 0x09, 0x0A, 0x0D, 0x20); the padding counts toward `N`.
+  - The header is a dict like `{"TENSOR_NAME": {"dtype": "F16", "shape": [1, 16, 256], "data_offsets": [BEGIN, END]}, "NEXT_TENSOR_NAME": {...}, ...}`,
+    - `data_offsets` point to the tensor data relative to the beginning of the byte buffer (i.e. not an absolute position in the file),
+      with `BEGIN` as the starting offset and `END` as the one-past offset (so total tensor byte size = `END - BEGIN`).
   - A special key `__metadata__` is allowed to contain free form string-to-string map. Arbitrary JSON is not allowed, all values must be strings.
 - Rest of the file: byte-buffer.
 

diff --git a/safetensors/src/lib.rs b/safetensors/src/lib.rs
@@ -53,8 +53,10 @@
 //!## Format
 //!
 //! - 8 bytes: `N`, an unsigned little-endian 64-bit integer, containing the size of the header
-//! - N bytes: a JSON utf-8 string representing the header.
-//!   - The header is a dict like `{"TENSOR_NAME": {"dtype": "F16", "shape": [1, 16, 256], "data_offsets": [BEGIN, END]}, "NEXT_TENSOR_NAME": {...}, ...}`, where offsets point to the tensor data relative to the beginning of the byte buffer, with `BEGIN` as the starting offset and `END` as the one-past offset (so total tensor byte size = `END - BEGIN`).
+//! - N bytes: a JSON UTF-8 string representing the header. It may be padded on either side with JSON whitespace (any of the bytes 0x09, 0x0A, 0x0D, 0x20); the padding counts toward `N`.
+//!   - The header is a dict like `{"TENSOR_NAME": {"dtype": "F16", "shape": [1, 16, 256], "data_offsets": [BEGIN, END]}, "NEXT_TENSOR_NAME": {...}, ...}`,
+//!     - `data_offsets` point to the tensor data relative to the beginning of the byte buffer (i.e. not an absolute position in the file),
+//!       with `BEGIN` as the starting offset and `END` as the one-past offset (so total tensor byte size = `END - BEGIN`).
 //!   - A special key `__metadata__` is allowed to contain free form string-to-string map. Arbitrary JSON is not allowed, all values must be strings.
 //! - Rest of the file: byte-buffer.
 //!

diff --git a/safetensors/src/tensor.rs b/safetensors/src/tensor.rs
@@ -1087,6 +1087,15 @@ mod tests {
         }
     }
 
+    #[test]
+    /// Test that the JSON header may be padded with JSON whitespace characters
+    /// on either side.
+    fn test_whitespace_padded_header() {
+        let serialized = b"\x06\x00\x00\x00\x00\x00\x00\x00\x09\x0A{}\x0D\x20";
+        let loaded = SafeTensors::deserialize(serialized).unwrap();
+        assert_eq!(loaded.len(), 0);
+    }
+
     #[test]
     fn test_zero_sized_tensor() {
         let serialized = b"<\x00\x00\x00\x00\x00\x00\x00{\"test\":{\"dtype\":\"I32\",\"shape\":[2,0],\"data_offsets\":[0, 0]}}";