From c3982cabe93bed13607cd59c5d1a8eee8f2439d9 Mon Sep 17 00:00:00 2001 From: Roman Bredehoft Date: Wed, 3 Apr 2024 15:30:49 +0200 Subject: [PATCH 1/3] chore: improve save/load methods for encrypted data-frames --- deps_licenses/licenses_mac_silicon_user.txt | 10 ++--- .../licenses_mac_silicon_user.txt.md5 | 2 +- src/concrete/ml/pandas/_utils.py | 12 +++--- src/concrete/ml/pandas/dataframe.py | 43 +++++++++++++------ 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/deps_licenses/licenses_mac_silicon_user.txt b/deps_licenses/licenses_mac_silicon_user.txt index 90eb7760e..62c6f3330 100644 --- a/deps_licenses/licenses_mac_silicon_user.txt +++ b/deps_licenses/licenses_mac_silicon_user.txt @@ -2,14 +2,14 @@ Name, Version, License GitPython, 3.1.41, BSD License PyYAML, 6.0.1, MIT License anyio, 3.7.1, MIT License -boto3, 1.34.72, Apache Software License -botocore, 1.34.72, Apache Software License +boto3, 1.34.75, Apache Software License +botocore, 1.34.75, Apache Software License brevitas, 0.8.0, UNKNOWN certifi, 2023.7.22, Mozilla Public License 2.0 (MPL 2.0) charset-normalizer, 3.3.2, MIT License click, 8.1.7, BSD License coloredlogs, 15.0.1, MIT License -concrete-python, 2024.3.27, BSD-3-Clause +concrete-python, 2.6.0rc1, BSD-3-Clause dependencies, 2.0.1, BSD License dill, 0.3.8, BSD License exceptiongroup, 1.2.0, MIT License @@ -19,7 +19,7 @@ flatbuffers, 24.3.25, Apache Software License fsspec, 2024.3.1, BSD License gitdb, 4.0.11, BSD License h11, 0.14.0, MIT License -huggingface-hub, 0.22.1, Apache Software License +huggingface-hub, 0.22.2, Apache Software License humanfriendly, 10.0, MIT License hummingbird-ml, 0.4.8, MIT License idna, 3.6, BSD License @@ -67,7 +67,7 @@ tokenizers, 0.15.2, Apache Software License tomli, 2.0.1, MIT License torch, 1.13.1, BSD License tqdm, 4.66.2, MIT License; Mozilla Public License 2.0 (MPL 2.0) -transformers, 4.39.1, Apache Software License +transformers, 4.39.3, Apache Software License typing_extensions, 4.5.0, Python Software Foundation License tzdata, 2024.1, Apache Software License urllib3, 2.2.1, MIT License diff --git a/deps_licenses/licenses_mac_silicon_user.txt.md5 b/deps_licenses/licenses_mac_silicon_user.txt.md5 index 60fdcedbe..263ba1589 100644 --- a/deps_licenses/licenses_mac_silicon_user.txt.md5 +++ b/deps_licenses/licenses_mac_silicon_user.txt.md5 @@ -1 +1 @@ -8de2e8c13fe9a1fe80d9cce43dee7493 +74a229e0dccc68a1f77c7ca59dbf7614 diff --git a/src/concrete/ml/pandas/_utils.py b/src/concrete/ml/pandas/_utils.py index 766066bf6..fcf425ca6 100644 --- a/src/concrete/ml/pandas/_utils.py +++ b/src/concrete/ml/pandas/_utils.py @@ -133,28 +133,28 @@ def deserialize_elementwise(array: numpy.ndarray) -> numpy.ndarray: return numpy.vectorize(deserialize_value)(array) -def serialize_evaluation_keys(evaluation_keys: fhe.EvaluationKeys) -> str: +def serialize_evaluation_keys(evaluation_keys: fhe.EvaluationKeys) -> bytes: """Serialize the evaluation keys into a string of hexadecimal numbers. Args: evaluation_keys (fhe.EvaluationKeys): The evaluation keys to serialize. Returns: - str: The serialized evaluation keys as a string of hexadecimal numbers. + bytes: The serialized evaluation keys. """ - return serialize_value(evaluation_keys) + return evaluation_keys.serialize() -def deserialize_evaluation_keys(serialized_evaluation_keys: str) -> fhe.EvaluationKeys: +def deserialize_evaluation_keys(serialized_evaluation_keys: bytes) -> fhe.EvaluationKeys: """Deserialize the evaluation keys. Args: - serialized_evaluation_keys (str): The evaluation keys to deserialize. + serialized_evaluation_keys (bytes): The evaluation keys to deserialize. Returns: fhe.EvaluationKeys: The deserialized evaluation keys. """ - return fhe.EvaluationKeys.deserialize(bytes.fromhex(serialized_evaluation_keys)) + return fhe.EvaluationKeys.deserialize(serialized_evaluation_keys) def slice_hex_str(hex_str: str, n: int = 10) -> str: diff --git a/src/concrete/ml/pandas/dataframe.py b/src/concrete/ml/pandas/dataframe.py index 2c158a656..fef9aaff9 100644 --- a/src/concrete/ml/pandas/dataframe.py +++ b/src/concrete/ml/pandas/dataframe.py @@ -1,5 +1,6 @@ """Define the encrypted data-frame framework.""" import json +import zipfile from pathlib import Path from typing import Dict, Hashable, List, Optional, Sequence, Tuple, Union @@ -254,11 +255,12 @@ def merge( return joined_df - def _to_dict(self) -> Dict: - """Serialize the encrypted data-frame as a dictionary. + def _to_dict_and_eval_keys(self) -> Tuple[Dict, fhe.EvaluationKeys]: + """Serialize the encrypted data-frame as a dictionary and evaluations keys. Returns: Dict: The serialized data-frame. + fhe.EvaluationKeys: The serialized evaluations keys. """ # Serialize encrypted values element-wise encrypted_values = serialize_elementwise(self._encrypted_values) @@ -273,20 +275,20 @@ def _to_dict(self) -> Dict: output_dict = { "encrypted_values": encrypted_values.tolist(), "encrypted_nan": encrypted_nan, - "evaluation_keys": evaluation_keys, "column_names": self._column_names, "dtype_mappings": self._dtype_mappings, "api_version": self._api_version, } - return output_dict + return output_dict, evaluation_keys @classmethod - def _from_dict(cls, dict_to_load: Dict): - """Load a serialized encrypted data-frame from a dictionary. + def _from_dict_and_eval_keys(cls, dict_to_load: Dict, evaluation_keys: fhe.EvaluationKeys): + """Load a serialized encrypted data-frame from a dictionary and evaluations keys. Args: dict_to_load (Dict): The serialized encrypted data-frame. + evaluation_keys (fhe.EvaluationKeys): The serialized evaluations keys. Returns: EncryptedDataFrame: The loaded encrypted data-frame. @@ -295,7 +297,7 @@ def _from_dict(cls, dict_to_load: Dict): encrypted_values = deserialize_elementwise(dict_to_load["encrypted_values"]) encrypted_nan = deserialize_value(dict_to_load["encrypted_nan"]) - evaluation_keys = deserialize_evaluation_keys(dict_to_load["evaluation_keys"]) + evaluation_keys = deserialize_evaluation_keys(evaluation_keys) column_names = dict_to_load["column_names"] dtype_mappings = dict_to_load["dtype_mappings"] @@ -318,9 +320,16 @@ def save(self, path: Union[Path, str]): """ path = Path(path) - encrypted_df_dict = self._to_dict() - with path.open("w", encoding="utf-8") as file: - json.dump(encrypted_df_dict, file) + if path.suffix != ".zip": + path = path.with_suffix(".zip") + + encrypted_df_dict, evaluation_keys = self._to_dict_and_eval_keys() + + encrypted_df_json_bytes = json.dumps(encrypted_df_dict).encode(encoding="utf-8") + + with zipfile.ZipFile(path, "w") as zip_file: + zip_file.writestr("encrypted_dataframe.json", encrypted_df_json_bytes) + zip_file.writestr("evaluation_keys", evaluation_keys) @classmethod def load(cls, path: Union[Path, str]): @@ -334,7 +343,15 @@ def load(cls, path: Union[Path, str]): """ path = Path(path) - with path.open("r", encoding="utf-8") as file: - encrypted_df_dict = json.load(file) + if path.suffix != ".zip": + path = path.with_suffix(".zip") + + with zipfile.ZipFile(path, "r") as zip_file: + with zip_file.open("encrypted_dataframe.json") as encrypted_df_json_file: + encrypted_df_json_bytes = encrypted_df_json_file.read() + encrypted_df_dict = json.loads(encrypted_df_json_bytes) + + with zip_file.open("evaluation_keys") as evaluation_keys_file: + evaluation_keys = evaluation_keys_file.read() - return cls._from_dict(encrypted_df_dict) + return cls._from_dict_and_eval_keys(encrypted_df_dict, evaluation_keys) From 8cc956d37af66392a2de8f2a47e9a645fd218956 Mon Sep 17 00:00:00 2001 From: Roman Bredehoft Date: Thu, 4 Apr 2024 10:33:43 +0200 Subject: [PATCH 2/3] chore: add explicit parameters in zipfile --- deps_licenses/licenses_mac_silicon_user.txt | 10 +++++----- deps_licenses/licenses_mac_silicon_user.txt.md5 | 2 +- src/concrete/ml/pandas/_utils.py | 2 +- src/concrete/ml/pandas/dataframe.py | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/deps_licenses/licenses_mac_silicon_user.txt b/deps_licenses/licenses_mac_silicon_user.txt index 62c6f3330..90eb7760e 100644 --- a/deps_licenses/licenses_mac_silicon_user.txt +++ b/deps_licenses/licenses_mac_silicon_user.txt @@ -2,14 +2,14 @@ Name, Version, License GitPython, 3.1.41, BSD License PyYAML, 6.0.1, MIT License anyio, 3.7.1, MIT License -boto3, 1.34.75, Apache Software License -botocore, 1.34.75, Apache Software License +boto3, 1.34.72, Apache Software License +botocore, 1.34.72, Apache Software License brevitas, 0.8.0, UNKNOWN certifi, 2023.7.22, Mozilla Public License 2.0 (MPL 2.0) charset-normalizer, 3.3.2, MIT License click, 8.1.7, BSD License coloredlogs, 15.0.1, MIT License -concrete-python, 2.6.0rc1, BSD-3-Clause +concrete-python, 2024.3.27, BSD-3-Clause dependencies, 2.0.1, BSD License dill, 0.3.8, BSD License exceptiongroup, 1.2.0, MIT License @@ -19,7 +19,7 @@ flatbuffers, 24.3.25, Apache Software License fsspec, 2024.3.1, BSD License gitdb, 4.0.11, BSD License h11, 0.14.0, MIT License -huggingface-hub, 0.22.2, Apache Software License +huggingface-hub, 0.22.1, Apache Software License humanfriendly, 10.0, MIT License hummingbird-ml, 0.4.8, MIT License idna, 3.6, BSD License @@ -67,7 +67,7 @@ tokenizers, 0.15.2, Apache Software License tomli, 2.0.1, MIT License torch, 1.13.1, BSD License tqdm, 4.66.2, MIT License; Mozilla Public License 2.0 (MPL 2.0) -transformers, 4.39.3, Apache Software License +transformers, 4.39.1, Apache Software License typing_extensions, 4.5.0, Python Software Foundation License tzdata, 2024.1, Apache Software License urllib3, 2.2.1, MIT License diff --git a/deps_licenses/licenses_mac_silicon_user.txt.md5 b/deps_licenses/licenses_mac_silicon_user.txt.md5 index 263ba1589..60fdcedbe 100644 --- a/deps_licenses/licenses_mac_silicon_user.txt.md5 +++ b/deps_licenses/licenses_mac_silicon_user.txt.md5 @@ -1 +1 @@ -74a229e0dccc68a1f77c7ca59dbf7614 +8de2e8c13fe9a1fe80d9cce43dee7493 diff --git a/src/concrete/ml/pandas/_utils.py b/src/concrete/ml/pandas/_utils.py index fcf425ca6..2c7a6b7f6 100644 --- a/src/concrete/ml/pandas/_utils.py +++ b/src/concrete/ml/pandas/_utils.py @@ -134,7 +134,7 @@ def deserialize_elementwise(array: numpy.ndarray) -> numpy.ndarray: def serialize_evaluation_keys(evaluation_keys: fhe.EvaluationKeys) -> bytes: - """Serialize the evaluation keys into a string of hexadecimal numbers. + """Serialize the evaluation keys into bytes. Args: evaluation_keys (fhe.EvaluationKeys): The evaluation keys to serialize. diff --git a/src/concrete/ml/pandas/dataframe.py b/src/concrete/ml/pandas/dataframe.py index fef9aaff9..d4716702d 100644 --- a/src/concrete/ml/pandas/dataframe.py +++ b/src/concrete/ml/pandas/dataframe.py @@ -1,8 +1,8 @@ """Define the encrypted data-frame framework.""" import json -import zipfile from pathlib import Path from typing import Dict, Hashable, List, Optional, Sequence, Tuple, Union +from zipfile import ZIP_STORED, ZipFile import numpy import pandas @@ -327,7 +327,7 @@ def save(self, path: Union[Path, str]): encrypted_df_json_bytes = json.dumps(encrypted_df_dict).encode(encoding="utf-8") - with zipfile.ZipFile(path, "w") as zip_file: + with ZipFile(path, "w", compression=ZIP_STORED, allowZip64=True) as zip_file: zip_file.writestr("encrypted_dataframe.json", encrypted_df_json_bytes) zip_file.writestr("evaluation_keys", evaluation_keys) @@ -346,7 +346,7 @@ def load(cls, path: Union[Path, str]): if path.suffix != ".zip": path = path.with_suffix(".zip") - with zipfile.ZipFile(path, "r") as zip_file: + with ZipFile(path, "r", compression=ZIP_STORED, allowZip64=True) as zip_file: with zip_file.open("encrypted_dataframe.json") as encrypted_df_json_file: encrypted_df_json_bytes = encrypted_df_json_file.read() encrypted_df_dict = json.loads(encrypted_df_json_bytes) From 6f138604a3579f811b32fa49591037a6ef3444ce Mon Sep 17 00:00:00 2001 From: Roman Bredehoft Date: Thu, 4 Apr 2024 11:46:56 +0200 Subject: [PATCH 3/3] chore: trigger ci