From b40dec408c87f1abf9c6fb83efb7c45fa986d521 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Wed, 10 Jan 2024 09:50:49 -0500 Subject: [PATCH 01/16] Enable verbosity modifications in PyTorch module --- example/hook_functions.py | 2 +- fickling/polyglot.py | 19 +++++++++++-------- fickling/pytorch.py | 5 +++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/example/hook_functions.py b/example/hook_functions.py index b91862d..a6ec808 100644 --- a/example/hook_functions.py +++ b/example/hook_functions.py @@ -7,7 +7,7 @@ # Set up global fickling hook fickling.always_check_safety() -# Eauivalent to fickling.hook.run_hook() +# Equivalent to fickling.hook.run_hook() # Fickling can check a pickle file for safety prior to running it test_list = [1, 2, 3] diff --git a/fickling/polyglot.py b/fickling/polyglot.py index 8cc76f1..0c7db25 100644 --- a/fickling/polyglot.py +++ b/fickling/polyglot.py @@ -175,7 +175,7 @@ def check_for_corruption(properties): return corrupted, reason -def identify_pytorch_file_format(file, print_properties=False): +def identify_pytorch_file_format(file, print_properties=False, print_results=False): """ We are intentionally matching the semantics of the PyTorch reference parsers. To be polyglot-aware, we show the file formats ranked by likelihood. @@ -215,16 +215,19 @@ def identify_pytorch_file_format(file, print_properties=False): print(reason) if len(formats) != 0: primary = formats[0] - print("Your file is most likely of this format: ", primary, "\n") + if print_results: + print("Your file is most likely of this format: ", primary, "\n") secondary = formats[1:] if len(secondary) != 0: - print("It is also possible that your file can be validly interpreted as: ", secondary) + if print_results: + print("It is also possible that your file can be validly interpreted as: ", secondary) else: - print( - """Your file may not be a PyTorch file. - No valid file formats were detected. - If this is a mistake, raise an issue on our GitHub.""" - ) + if print_results: + print( + """Your file may not be a PyTorch file. + No valid file formats were detected. + If this is a mistake, raise an issue on our GitHub.""" + ) return formats diff --git a/fickling/pytorch.py b/fickling/pytorch.py index 976a5bc..980d9b5 100644 --- a/fickling/pytorch.py +++ b/fickling/pytorch.py @@ -25,14 +25,15 @@ def __reduce__(self): class PyTorchModelWrapper: - def __init__(self, path: Path, force: bool = False): + def __init__(self, path: Path, force: bool = False, verbose: bool = False): self.path: Path = path self._pickled: Optional[Pickled] = None self.force: bool = force self._formats: Set[str] = set() + self.verbose: bool = verbose def validate_file_format(self): - self._formats = fickling.polyglot.identify_pytorch_file_format(self.path) + self._formats = fickling.polyglot.identify_pytorch_file_format(self.path, print_results=self.verbose) """ One option was to raise an error if PyTorch v1.3 was not found or if any of the TorchScript versions were found. From 9795a4b2274bd7c40c50cb2d6cdc6acef4a54608 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Wed, 10 Jan 2024 10:06:22 -0500 Subject: [PATCH 02/16] Add new polyglot example --- example/README.md | 1 + example/identify_pytorch_file.py | 14 ++++++++++++++ fickling/polyglot.py | 12 ++++++------ fickling/pytorch.py | 2 +- 4 files changed, 22 insertions(+), 7 deletions(-) create mode 100644 example/identify_pytorch_file.py diff --git a/example/README.md b/example/README.md index 821e3ca..b87f47c 100644 --- a/example/README.md +++ b/example/README.md @@ -7,3 +7,4 @@ * [inject_pytorch.py](https://github.com/trailofbits/fickling/blob/master/example/inject_pytorch.py): Inject a model loaded from a PyTorch file with malicious code using fickling’s PyTorch module * [numpy_poc.py](https://github.com/trailofbits/fickling/blob/master/example/numpy_poc.py): Analyze a malicious payload passed to `numpy.load()` * [trace_binary.py](https://github.com/trailofbits/fickling/blob/master/example/trace_binary.py): Decompile a payload using the tracing module +* [identify_pytorch_file.py](https://github.com/trailofbits/fickling/blob/master/example/identify_pytorch_file.py): Determine the file format type of 2 different PyTorch files \ No newline at end of file diff --git a/example/identify_pytorch_file.py b/example/identify_pytorch_file.py new file mode 100644 index 0000000..924e09f --- /dev/null +++ b/example/identify_pytorch_file.py @@ -0,0 +1,14 @@ +import torch +import torchvision.models as models + +import fickling.polyglot as polyglot + +model = models.mobilenet_v2() +torch.save(model, "mobilenet.pth") + +print("Identifying PyTorch 1.3 file:") +potential_formats = polyglot.identify_pytorch_file_format("mobilenet.pth", print_results=True) + +torch.save(model, "legacy_mobilenet.pth", _use_new_zipfile_serialization=False) +print("Identifying PyTorch v0.1.10 file:") +potential_formats_legacy = polyglot.identify_pytorch_file_format("legacy_mobilenet.pth", print_results=True) diff --git a/fickling/polyglot.py b/fickling/polyglot.py index 0c7db25..c35fd82 100644 --- a/fickling/polyglot.py +++ b/fickling/polyglot.py @@ -32,7 +32,7 @@ def check_and_find_in_zip( - zip_path, file_name_or_extension, return_path=False, check_extension=False + zip_path, file_name_or_extension, return_path=False, check_extension=False ): """Check for a file in the zip and return its path or boolean if found.""" try: @@ -165,9 +165,9 @@ def check_for_corruption(properties): # We expect this to be expanded upon if properties["is_torch_zip"]: if ( - properties["has_model_json"] - and not properties["has_attribute_pkl"] - and not properties["has_constants_pkl"] + properties["has_model_json"] + and not properties["has_attribute_pkl"] + and not properties["has_constants_pkl"] ): corrupted = True reason = """Your file may be corrupted. It contained a @@ -239,7 +239,7 @@ def append_file(source_filename, destination_filename): return destination_filename -def make_zip_pickle_polyglot(zip_file, pickle_file, copy=False): +def make_zip_pickle_polyglot(zip_file, pickle_file): append_file(zip_file, pickle_file) @@ -293,7 +293,7 @@ def create_polyglot(first_file, second_file): print("The polyglot is contained in polyglot.mar.tar") if polyglot_found is False: print( - """Fickling was not able to create any polglots. + """Fickling was not able to create any polyglots. If you think this is a mistake, raise an issue on our GitHub.""" ) os.remove(temp_first_file) diff --git a/fickling/pytorch.py b/fickling/pytorch.py index 980d9b5..428a59b 100644 --- a/fickling/pytorch.py +++ b/fickling/pytorch.py @@ -116,7 +116,7 @@ def pickled(self) -> Pickled: return self._pickled def inject_payload( - self, payload: str, output_path: Path, injection: str = "all", overwrite: bool = False + self, payload: str, output_path: Path, injection: str = "all", overwrite: bool = False ) -> None: self.output_path = output_path if injection == "insertion": From 478d4421701a64eedf3263408f0d693e4043c6e7 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Fri, 19 Jan 2024 17:40:49 -0500 Subject: [PATCH 03/16] Add creation test --- test/test_polyglot.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/test_polyglot.py b/test/test_polyglot.py index f090485..c1f927e 100644 --- a/test/test_polyglot.py +++ b/test/test_polyglot.py @@ -52,6 +52,10 @@ def setUp(self): self.filename_v1_3 = "model_v1_3.pth" torch.save(model, self.filename_v1_3) + # PyTorch v1.3 Dup (for testing) + self.filename_v1_3_dup = "model_v1_3_dup.pth" + torch.save(model, self.filename_v1_3_dup) + # PyTorch v0.1.10 (Stacked pickle files) self.filename_legacy_pickle = "model_legacy_pickle.pth" torch.save(model, self.filename_legacy_pickle, _use_new_zipfile_serialization=False) @@ -61,6 +65,10 @@ def setUp(self): self.filename_torchscript = "model_torchscript.pt" torch.jit.save(m, self.filename_torchscript) + # TorchScript v1.4 + self.filename_torchscript_dup = "model_torchscript_dup.pt" + torch.jit.save(m, self.filename_torchscript_dup) + # PyTorch v0.1.1 self.filename_legacy_tar = "model_legacy_tar.pth" create_pytorch_legacy_tar(self.filename_legacy_tar) @@ -70,6 +78,8 @@ def setUp(self): create_random_zip(self.zip_filename) prepend_random_string(self.zip_filename) + self.standard_torchscript_polyglot_name = "test_polyglot.pt" + def tearDown(self): for filename in [ self.filename_v1_3, @@ -77,6 +87,9 @@ def tearDown(self): self.filename_torchscript, self.filename_legacy_tar, self.zip_filename, + self.filename_torchscript_dup, + self.filename_v1_3_dup, + self.standard_torchscript_polyglot_name ]: if os.path.exists(filename): os.remove(filename) @@ -166,3 +179,8 @@ def test_zip_properties(self): "has_attribute_pkl": False, } self.assertEqual(properties, proper_result) + + def test_create_standard_torchscript_polyglot(self): + polyglot.create_polyglot(self.filename_v1_3_dup, self.filename_torchscript_dup, self.standard_torchscript_polyglot_name, print_results=False) + formats = polyglot.identify_pytorch_file_format(self.standard_torchscript_polyglot_name) + self.assertTrue({"PyTorch v1.3", "TorchScript v1.4"}.issubset(formats)) From 67365c4fa072add3786b18fb5ee6948263d567db Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Fri, 19 Jan 2024 17:57:04 -0500 Subject: [PATCH 04/16] Refactor polyglot module for clarity --- fickling/polyglot.py | 160 ++++++++++++++++++++++++++++++------------ test/test_polyglot.py | 1 + 2 files changed, 116 insertions(+), 45 deletions(-) diff --git a/fickling/polyglot.py b/fickling/polyglot.py index c35fd82..093a421 100644 --- a/fickling/polyglot.py +++ b/fickling/polyglot.py @@ -19,7 +19,7 @@ • TorchScript v1.3: ZIP file with data.pkl and constants.pkl (2 pickle files) • TorchScript v1.4: ZIP file with data.pkl, constants.pkl, and version (2 pickle files and a folder) • PyTorch v1.3: ZIP file containing data.pkl (1 pickle file) -• PyTorch model archive format: ZIP file that includes Python code files and pickle files +• PyTorch model archive format[ZIP]: ZIP file that includes Python code files and pickle files Officially, PyTorch v0.1.1 and TorchScript < v1.4 are deprecated. However, they are still supported by some legacy parsers @@ -71,14 +71,14 @@ def check_pickle(file): def find_file_properties(file_path, print_properties=False): - """For a more granular analysis, we separate property discover and format identification""" + """For a more granular analysis, we separate property discovery and format identification""" properties = {} with open(file_path, "rb") as file: # PyTorch's torch.load() enforces a specific magic number at offset 0 for ZIP is_torch_zip = _is_zipfile(file) properties["is_torch_zip"] = is_torch_zip - # This tarfile check has many false positivies. It is not a determinant of PyTorch v0.1.1. + # This tarfile check has many false positives. It is not a determinant of PyTorch v0.1.1. if sys.version_info >= (3, 9): is_tar = tarfile.is_tarfile(file) else: @@ -179,6 +179,8 @@ def identify_pytorch_file_format(file, print_properties=False, print_results=Fal """ We are intentionally matching the semantics of the PyTorch reference parsers. To be polyglot-aware, we show the file formats ranked by likelihood. + Our parsing depth is at the file structure level; + However, it can be at the full parsing level if necessary. """ properties = find_file_properties(file, print_properties) formats = [] @@ -239,11 +241,60 @@ def append_file(source_filename, destination_filename): return destination_filename -def make_zip_pickle_polyglot(zip_file, pickle_file): +def create_zip_pickle_polyglot(zip_file, pickle_file): append_file(zip_file, pickle_file) -def create_polyglot(first_file, second_file): +def create_mar_legacy_pickle_polyglot(files, print_results=False, polyglot_file_name="polyglot.mar.pt"): + files.sort(key=lambda x: x[1] != "PyTorch model archive format") + if print_results: + print("Making a PyTorch MAR/PyTorch v0.1.10 polyglot") + polyglot_file = append_file(*[file[0] for file in files]) + shutil.copy(polyglot_file, polyglot_file_name) + polyglot_found = True + return polyglot_found + + +def create_standard_torchscript_polyglot(files, print_results=False, polyglot_file_name="polyglot.pt"): + if print_results: + print("Making a PyTorch v1.3/TorchScript v1.4 polyglot") + print("Warning: For some parsers, this may generate polymocks instead of polyglots.") + standard_pytorch_file = [file[0] for file in files if file[1] == "PyTorch v1.3"][0] + torchscript_file = [file[0] for file in files if file[1] == "TorchScript v1.4"][0] + if polyglot_file_name is None: + polyglot_file_name = "polyglot.pt" + shutil.copy(standard_pytorch_file, polyglot_file_name) + + with zipfile.ZipFile(torchscript_file, "r") as zip_b: + constants_pkl_path = check_and_find_in_zip( + zip_b, "constants.pkl", check_extension=False, return_path=True + ) + version_path = check_and_find_in_zip(zip_b, "version", return_path=True) + if constants_pkl_path and version_path: + zip_b.extract(constants_pkl_path, "temp") + zip_b.extract(version_path, "temp") + + with zipfile.ZipFile(polyglot_file_name, "a") as zip_out: + zip_out.write(f"temp/{constants_pkl_path}", "constants.pkl") + zip_out.write(f"temp/{version_path}", "version") + + shutil.rmtree("temp") + polyglot_found = True + return polyglot_found + +def create_mar_legacy_tar_polyglot(files, print_results=False, polyglot_file_name="polyglot.mar.tar"): + if print_results: + print("Making a PyTorch v0.1.1/PyTorch MAR polyglot") + mar_file = [file[0] for file in files if file[1] == "PyTorch model archive format"][0] + tar_file = [file[0] for file in files if file[1] == "PyTorch v0.1.1"][0] + polyglot_file = append_file(mar_file, tar_file) + shutil.copy(polyglot_file, polyglot_file_name) + polyglot_found = True + return polyglot_found + + +def create_polyglot(first_file, second_file, polyglot_file_name=None, print_results=True): + polyglot_found = False temp_first_file = "temp_" + os.path.basename(first_file) temp_second_file = "temp_" + os.path.basename(second_file) shutil.copy(first_file, temp_first_file) @@ -253,49 +304,68 @@ def create_polyglot(first_file, second_file): (temp_second_file, identify_pytorch_file_format(temp_second_file)[0]), ] formats = set(map(lambda x: x[1], files)) # noqa - polyglot_found = False if {"PyTorch model archive format", "PyTorch v0.1.10"}.issubset(formats): - files.sort(key=lambda x: x[1] != "PyTorch model archive format") - print("Making a PyTorch MAR/PyTorch v0.1.10 polyglot") - polyglot_found = True - polyglot_file = append_file(*[file[0] for file in files]) - shutil.copy(polyglot_file, "polyglot.mar.pt") - print("The polyglot is contained in polyglot.mar.pt") + if polyglot_file_name is None: + polyglot_file_name = "polyglot.mar.pt" + polyglot_found = create_mar_legacy_pickle_polyglot(files, print_results, polyglot_file_name) + # files.sort(key=lambda x: x[1] != "PyTorch model archive format") + # if print_results: + # print("Making a PyTorch MAR/PyTorch v0.1.10 polyglot") + # polyglot_found = True + # polyglot_file = append_file(*[file[0] for file in files]) + # if polyglot_file_name is None: + # polyglot_file_name = "polyglot.mar.pt" + # shutil.copy(polyglot_file, polyglot_file_name) if {"PyTorch v1.3", "TorchScript v1.4"}.issubset(formats): - print("Making a PyTorch v1.3/TorchScript v1.4 polyglot") - print("Warning: For some parsers, this may generate polymocks instead of polyglots.") - polyglot_found = True - standard_pytorch_file = [file[0] for file in files if file[1] == "PyTorch v1.3"][0] - torchscript_file = [file[0] for file in files if file[1] == "TorchScript v1.4"][0] - shutil.copy(standard_pytorch_file, "polyglot.pt") - - with zipfile.ZipFile(torchscript_file, "r") as zip_b: - constants_pkl_path = check_and_find_in_zip( - zip_b, "constants.pkl", check_extension=False, return_path=True - ) - version_path = check_and_find_in_zip(zip_b, "version", return_path=True) - if constants_pkl_path and version_path: - zip_b.extract(constants_pkl_path, "temp") - zip_b.extract(version_path, "temp") - - with zipfile.ZipFile("polyglot.pt", "a") as zip_out: - zip_out.write(f"temp/{constants_pkl_path}", "constants.pkl") - zip_out.write(f"temp/{version_path}", "version") - - shutil.rmtree("temp") + if polyglot_file_name is None: + polyglot_file_name = "polyglot.pt" + polyglot_found = create_standard_torchscript_polyglot(files, print_results, polyglot_file_name) + # if print_results: + # print("Making a PyTorch v1.3/TorchScript v1.4 polyglot") + # print("Warning: For some parsers, this may generate polymocks instead of polyglots.") + # polyglot_found = True + # standard_pytorch_file = [file[0] for file in files if file[1] == "PyTorch v1.3"][0] + # torchscript_file = [file[0] for file in files if file[1] == "TorchScript v1.4"][0] + # if polyglot_file_name is None: + # polyglot_file_name = "polyglot.pt" + # shutil.copy(standard_pytorch_file, polyglot_file_name) + # + # with zipfile.ZipFile(torchscript_file, "r") as zip_b: + # constants_pkl_path = check_and_find_in_zip( + # zip_b, "constants.pkl", check_extension=False, return_path=True + # ) + # version_path = check_and_find_in_zip(zip_b, "version", return_path=True) + # if constants_pkl_path and version_path: + # zip_b.extract(constants_pkl_path, "temp") + # zip_b.extract(version_path, "temp") + # + # with zipfile.ZipFile(polyglot_file_name, "a") as zip_out: + # zip_out.write(f"temp/{constants_pkl_path}", "constants.pkl") + # zip_out.write(f"temp/{version_path}", "version") + # + # shutil.rmtree("temp") if {"PyTorch model archive format", "PyTorch v0.1.1"}.issubset(formats): - print("Making a PyTorch v0.1.1/PyTorch MAR polyglot") - polyglot_found = True - mar_file = [file[0] for file in files if file[1] == "PyTorch model archive format"][0] - tar_file = [file[0] for file in files if file[1] == "PyTorch v0.1.1"][0] - polyglot_file = append_file(mar_file, tar_file) - shutil.copy(polyglot_file, "polyglot.mar.tar") - print("The polyglot is contained in polyglot.mar.tar") - if polyglot_found is False: - print( - """Fickling was not able to create any polyglots. - If you think this is a mistake, raise an issue on our GitHub.""" - ) + if polyglot_file_name is None: + polyglot_file_name = "polyglot.mar.tar" + polyglot_found = create_mar_legacy_tar_polyglot(files, print_results, polyglot_file_name) + # if print_results: + # print("Making a PyTorch v0.1.1/PyTorch MAR polyglot") + # polyglot_found = True + # mar_file = [file[0] for file in files if file[1] == "PyTorch model archive format"][0] + # tar_file = [file[0] for file in files if file[1] == "PyTorch v0.1.1"][0] + # polyglot_file = append_file(mar_file, tar_file) + # if polyglot_file_name is None: + # polyglot_file_name = "polyglot.mar.tar" + # shutil.copy(polyglot_file, "polyglot.mar.tar") + + if print_results: + if polyglot_found is False: + print( + """Fickling was not able to create any polyglots. + If you think this is a mistake, raise an issue on our GitHub.""" + ) + else: + print(f"The polyglot is contained in {polyglot_file_name}") os.remove(temp_first_file) os.remove(temp_second_file) return polyglot_found diff --git a/test/test_polyglot.py b/test/test_polyglot.py index c1f927e..9deb535 100644 --- a/test/test_polyglot.py +++ b/test/test_polyglot.py @@ -184,3 +184,4 @@ def test_create_standard_torchscript_polyglot(self): polyglot.create_polyglot(self.filename_v1_3_dup, self.filename_torchscript_dup, self.standard_torchscript_polyglot_name, print_results=False) formats = polyglot.identify_pytorch_file_format(self.standard_torchscript_polyglot_name) self.assertTrue({"PyTorch v1.3", "TorchScript v1.4"}.issubset(formats)) + From 80efdfe8f2e78dd28cf24fc5a30b73bee29993e0 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Fri, 19 Jan 2024 18:23:09 -0500 Subject: [PATCH 05/16] Fix typos and clean up comments --- fickling/cli.py | 2 +- fickling/fickle.py | 2 +- fickling/polyglot.py | 42 ------------------------------------------ fickling/pytorch.py | 4 ++-- 4 files changed, 4 insertions(+), 46 deletions(-) diff --git a/fickling/cli.py b/fickling/cli.py index 0391876..0aed818 100644 --- a/fickling/cli.py +++ b/fickling/cli.py @@ -35,7 +35,7 @@ def main(argv: Optional[List[str]] = None) -> int: "-i", type=str, default=None, - help="inject the specified Python code to be run at the end of depickling, " + help="inject the specified Python code to be run at the end of unpickling, " "and output the resulting pickle data", ) parser.add_argument( diff --git a/fickling/fickle.py b/fickling/fickle.py index ee51ed4..b1d0e00 100644 --- a/fickling/fickle.py +++ b/fickling/fickle.py @@ -508,7 +508,7 @@ def append_python( def insert_magic_int(self, magic: int, index: int = -1): """Insert and pop a specific integer value. This is used for persistent injections to locate the injection payload in the pickled file. The value - is artificially added by using an dummy INT + POP combination that doesn't + is artificially added by using a dummy INT + POP combination that doesn't affect the stack when executed :param magic: magic integer value to add diff --git a/fickling/polyglot.py b/fickling/polyglot.py index 093a421..5467205 100644 --- a/fickling/polyglot.py +++ b/fickling/polyglot.py @@ -308,56 +308,14 @@ def create_polyglot(first_file, second_file, polyglot_file_name=None, print_resu if polyglot_file_name is None: polyglot_file_name = "polyglot.mar.pt" polyglot_found = create_mar_legacy_pickle_polyglot(files, print_results, polyglot_file_name) - # files.sort(key=lambda x: x[1] != "PyTorch model archive format") - # if print_results: - # print("Making a PyTorch MAR/PyTorch v0.1.10 polyglot") - # polyglot_found = True - # polyglot_file = append_file(*[file[0] for file in files]) - # if polyglot_file_name is None: - # polyglot_file_name = "polyglot.mar.pt" - # shutil.copy(polyglot_file, polyglot_file_name) if {"PyTorch v1.3", "TorchScript v1.4"}.issubset(formats): if polyglot_file_name is None: polyglot_file_name = "polyglot.pt" polyglot_found = create_standard_torchscript_polyglot(files, print_results, polyglot_file_name) - # if print_results: - # print("Making a PyTorch v1.3/TorchScript v1.4 polyglot") - # print("Warning: For some parsers, this may generate polymocks instead of polyglots.") - # polyglot_found = True - # standard_pytorch_file = [file[0] for file in files if file[1] == "PyTorch v1.3"][0] - # torchscript_file = [file[0] for file in files if file[1] == "TorchScript v1.4"][0] - # if polyglot_file_name is None: - # polyglot_file_name = "polyglot.pt" - # shutil.copy(standard_pytorch_file, polyglot_file_name) - # - # with zipfile.ZipFile(torchscript_file, "r") as zip_b: - # constants_pkl_path = check_and_find_in_zip( - # zip_b, "constants.pkl", check_extension=False, return_path=True - # ) - # version_path = check_and_find_in_zip(zip_b, "version", return_path=True) - # if constants_pkl_path and version_path: - # zip_b.extract(constants_pkl_path, "temp") - # zip_b.extract(version_path, "temp") - # - # with zipfile.ZipFile(polyglot_file_name, "a") as zip_out: - # zip_out.write(f"temp/{constants_pkl_path}", "constants.pkl") - # zip_out.write(f"temp/{version_path}", "version") - # - # shutil.rmtree("temp") if {"PyTorch model archive format", "PyTorch v0.1.1"}.issubset(formats): if polyglot_file_name is None: polyglot_file_name = "polyglot.mar.tar" polyglot_found = create_mar_legacy_tar_polyglot(files, print_results, polyglot_file_name) - # if print_results: - # print("Making a PyTorch v0.1.1/PyTorch MAR polyglot") - # polyglot_found = True - # mar_file = [file[0] for file in files if file[1] == "PyTorch model archive format"][0] - # tar_file = [file[0] for file in files if file[1] == "PyTorch v0.1.1"][0] - # polyglot_file = append_file(mar_file, tar_file) - # if polyglot_file_name is None: - # polyglot_file_name = "polyglot.mar.tar" - # shutil.copy(polyglot_file, "polyglot.mar.tar") - if print_results: if polyglot_found is False: print( diff --git a/fickling/pytorch.py b/fickling/pytorch.py index 428a59b..e650c3a 100644 --- a/fickling/pytorch.py +++ b/fickling/pytorch.py @@ -118,7 +118,7 @@ def pickled(self) -> Pickled: def inject_payload( self, payload: str, output_path: Path, injection: str = "all", overwrite: bool = False ) -> None: - self.output_path = output_path + output_path = output_path if injection == "insertion": # This does NOT bypass the weights based unpickler pickled = self.pickled @@ -140,6 +140,6 @@ def inject_payload( if overwrite is True: # Rename the new file to replace the original file Path(output_path).rename(self.path) - output_path = Path(self.output_path) + output_path = Path(output_path) if output_path.exists(): os.remove(output_path) From 46ad4dd4d3cd7ca2d5f079ac55bd8032131a37f6 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Thu, 25 Jan 2024 08:39:35 -0500 Subject: [PATCH 06/16] Refine support for TorchScript in the PyTorch module --- fickling/pytorch.py | 51 +++++++++++++++++++++++++++----------------- test/test_pytorch.py | 29 ++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/fickling/pytorch.py b/fickling/pytorch.py index e650c3a..f91a690 100644 --- a/fickling/pytorch.py +++ b/fickling/pytorch.py @@ -9,7 +9,6 @@ import fickling.polyglot from fickling.fickle import Pickled - class BaseInjection(torch.nn.Module): # This class allows you to combine the payload and original model def __init__(self, original_model: torch.nn.Module, payload: str): @@ -25,22 +24,20 @@ def __reduce__(self): class PyTorchModelWrapper: - def __init__(self, path: Path, force: bool = False, verbose: bool = False): + def __init__(self, path: Path, force: bool = False): self.path: Path = path self._pickled: Optional[Pickled] = None self.force: bool = force self._formats: Set[str] = set() - self.verbose: bool = verbose def validate_file_format(self): - self._formats = fickling.polyglot.identify_pytorch_file_format(self.path, print_results=self.verbose) - """ - One option was to raise an error if PyTorch v1.3 was not found - or if any of the TorchScript versions were found. - However, that would prevent polyglots from being loaded. - Therefore, the 'force' argument was created to enable users to do that if needed. - Another option was to warn only if "PyTorch v1.3" was not the most likely format. - Instead, the file formats are directly specified for clarity and independence. + self._formats = fickling.polyglot.identify_pytorch_file_format(self.path) + """ + PyTorch v1.3 and TorchScript v1.4 are explicitly supported by PyTorchModelWrapper. + This class may work on other file formats depending on its construction. + To enable users to check that and load polyglots, the force argument exists. + There is a warning for TorchScript v1.4 because of the scripting/tracing/mixing edge cases. + Technically, injections applied to that format should not work torch.jit.load() but may work on torch.load(). """ if len(self._formats) == 0: if self.force is True: @@ -58,12 +55,14 @@ def validate_file_format(self): If it is a PyTorch file, raise an issue on GitHub. """ ) - if ("PyTorch v1.3" not in self._formats) or { - "TorchScript v1.4", - "TorchScript v1.3", - "TorchScript v1.1", - "TorchScript v1.0", - }.intersection(self._formats): + # if (("PyTorch v1.3" not in self._formats) or + # { + # "TorchScript v1.4", + # "TorchScript v1.3", + # "TorchScript v1.1", + # "TorchScript v1.0", + # }.intersection(self._formats)): + if ("PyTorch v1.3" not in self._formats) and ("TorchScript v1.4" not in self._formats): if "PyTorch v0.1.10" in self._formats: if self.force is True: warnings.warn( @@ -93,6 +92,12 @@ def validate_file_format(self): """A fickling wrapper and injection method does not exist for that format. Please raise an issue on our GitHub or use the argument `force=True`.""" ) + if self._formats[0] == "TorchScript v1.4": + warnings.warn( + """Support for TorchScript v1.4 files is experimental. The effectiveness of the capabilities of + PyTorchModelWrapper, especially injection, is highly dependent on the construction of the model.""", + UserWarning, + ) return self._formats @property @@ -116,9 +121,15 @@ def pickled(self) -> Pickled: return self._pickled def inject_payload( - self, payload: str, output_path: Path, injection: str = "all", overwrite: bool = False + self, payload: str, output_path: Path, injection: str = "all", overwrite: bool = False ) -> None: - output_path = output_path + self.output_path = output_path + if self.formats[0] == "TorchScript v1.4": + warnings.warn( + """Support for TorchScript v1.4 files is experimental. Injections may not be effective + depending on the construction of the model and the target parser.""", + UserWarning, + ) if injection == "insertion": # This does NOT bypass the weights based unpickler pickled = self.pickled @@ -140,6 +151,6 @@ def inject_payload( if overwrite is True: # Rename the new file to replace the original file Path(output_path).rename(self.path) - output_path = Path(output_path) + output_path = Path(self.output_path) if output_path.exists(): os.remove(output_path) diff --git a/test/test_pytorch.py b/test/test_pytorch.py index e0eeae4..181197c 100644 --- a/test/test_pytorch.py +++ b/test/test_pytorch.py @@ -14,9 +14,12 @@ def setUp(self): self.filename_v1_3 = "test_model.pth" torch.save(model, self.filename_v1_3) self.zip_filename = "test_random_data.zip" + m = torch.jit.script(model) + self.torchscript_filename = "test_model_torchscript.pth" + torch.jit.save(m, self.torchscript_filename) def tearDown(self): - for filename in [self.filename_v1_3, self.zip_filename]: + for filename in [self.filename_v1_3, self.zip_filename, self.torchscript_filename]: if os.path.exists(filename): os.remove(filename) @@ -25,12 +28,23 @@ def test_wrapper(self): PyTorchModelWrapper(self.filename_v1_3) except Exception as e: # noqa self.fail(f"PyTorchModelWrapper was not able to load a PyTorch v1.3 file: {e}") + def test_torchscript_wrapper(self): + try: + PyTorchModelWrapper(self.torchscript_filename) + except Exception as e: # noqa + self.fail(f"PyTorchModelWrapper was not able to load a TorchScript v1.4 file: {e}") def test_pickled(self): result = PyTorchModelWrapper(self.filename_v1_3) pickled_portion = result.pickled self.assertIsInstance(pickled_portion, Pickled) + def test_torchscript_pickled(self): + result = PyTorchModelWrapper(self.torchscript_filename) + pickled_portion = result.pickled + self.assertIsInstance(pickled_portion, Pickled) + + def test_injection_insertion(self): try: result = PyTorchModelWrapper(self.filename_v1_3) @@ -43,6 +57,19 @@ def test_injection_insertion(self): f"PyTorchModelWrapper was not able to inject code into a PyTorch v1.3 file: {e}" ) + def test_injection_insertion(self): + try: + result = PyTorchModelWrapper(self.filename_v1_3) + temp_filename = "temp_filename" + result.inject_payload("print('Hello, World!')", temp_filename, injection="insertion") + if os.path.exists(temp_filename): + os.remove(temp_filename) + except Exception as e: # noqa + self.fail( + f"PyTorchModelWrapper was not able to inject code into a PyTorch v1.3 file: {e}" + ) + + def test_injection_combination(self): try: result = PyTorchModelWrapper(self.filename_v1_3) From ef6c0cc085a84ee1b4f174d0ce379f55c314dcfa Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Thu, 25 Jan 2024 08:44:30 -0500 Subject: [PATCH 07/16] Lint --- example/identify_pytorch_file.py | 4 +++- fickling/polyglot.py | 31 +++++++++++++++++++++---------- fickling/pytorch.py | 16 ++++++++-------- test/test_polyglot.py | 10 +++++++--- test/test_pytorch.py | 15 +-------------- 5 files changed, 40 insertions(+), 36 deletions(-) diff --git a/example/identify_pytorch_file.py b/example/identify_pytorch_file.py index 924e09f..7e6ed04 100644 --- a/example/identify_pytorch_file.py +++ b/example/identify_pytorch_file.py @@ -11,4 +11,6 @@ torch.save(model, "legacy_mobilenet.pth", _use_new_zipfile_serialization=False) print("Identifying PyTorch v0.1.10 file:") -potential_formats_legacy = polyglot.identify_pytorch_file_format("legacy_mobilenet.pth", print_results=True) +potential_formats_legacy = polyglot.identify_pytorch_file_format( + "legacy_mobilenet.pth", print_results=True +) diff --git a/fickling/polyglot.py b/fickling/polyglot.py index 5467205..2696deb 100644 --- a/fickling/polyglot.py +++ b/fickling/polyglot.py @@ -32,7 +32,7 @@ def check_and_find_in_zip( - zip_path, file_name_or_extension, return_path=False, check_extension=False + zip_path, file_name_or_extension, return_path=False, check_extension=False ): """Check for a file in the zip and return its path or boolean if found.""" try: @@ -165,9 +165,9 @@ def check_for_corruption(properties): # We expect this to be expanded upon if properties["is_torch_zip"]: if ( - properties["has_model_json"] - and not properties["has_attribute_pkl"] - and not properties["has_constants_pkl"] + properties["has_model_json"] + and not properties["has_attribute_pkl"] + and not properties["has_constants_pkl"] ): corrupted = True reason = """Your file may be corrupted. It contained a @@ -222,7 +222,9 @@ def identify_pytorch_file_format(file, print_properties=False, print_results=Fal secondary = formats[1:] if len(secondary) != 0: if print_results: - print("It is also possible that your file can be validly interpreted as: ", secondary) + print( + "It is also possible that your file can be validly interpreted as: ", secondary + ) else: if print_results: print( @@ -245,7 +247,9 @@ def create_zip_pickle_polyglot(zip_file, pickle_file): append_file(zip_file, pickle_file) -def create_mar_legacy_pickle_polyglot(files, print_results=False, polyglot_file_name="polyglot.mar.pt"): +def create_mar_legacy_pickle_polyglot( + files, print_results=False, polyglot_file_name="polyglot.mar.pt" +): files.sort(key=lambda x: x[1] != "PyTorch model archive format") if print_results: print("Making a PyTorch MAR/PyTorch v0.1.10 polyglot") @@ -255,7 +259,9 @@ def create_mar_legacy_pickle_polyglot(files, print_results=False, polyglot_file_ return polyglot_found -def create_standard_torchscript_polyglot(files, print_results=False, polyglot_file_name="polyglot.pt"): +def create_standard_torchscript_polyglot( + files, print_results=False, polyglot_file_name="polyglot.pt" +): if print_results: print("Making a PyTorch v1.3/TorchScript v1.4 polyglot") print("Warning: For some parsers, this may generate polymocks instead of polyglots.") @@ -282,9 +288,12 @@ def create_standard_torchscript_polyglot(files, print_results=False, polyglot_fi polyglot_found = True return polyglot_found -def create_mar_legacy_tar_polyglot(files, print_results=False, polyglot_file_name="polyglot.mar.tar"): + +def create_mar_legacy_tar_polyglot( + files, print_results=False, polyglot_file_name="polyglot.mar.tar" +): if print_results: - print("Making a PyTorch v0.1.1/PyTorch MAR polyglot") + print("Making a PyTorch v0.1.1/PyTorch MAR polyglot") mar_file = [file[0] for file in files if file[1] == "PyTorch model archive format"][0] tar_file = [file[0] for file in files if file[1] == "PyTorch v0.1.1"][0] polyglot_file = append_file(mar_file, tar_file) @@ -311,7 +320,9 @@ def create_polyglot(first_file, second_file, polyglot_file_name=None, print_resu if {"PyTorch v1.3", "TorchScript v1.4"}.issubset(formats): if polyglot_file_name is None: polyglot_file_name = "polyglot.pt" - polyglot_found = create_standard_torchscript_polyglot(files, print_results, polyglot_file_name) + polyglot_found = create_standard_torchscript_polyglot( + files, print_results, polyglot_file_name + ) if {"PyTorch model archive format", "PyTorch v0.1.1"}.issubset(formats): if polyglot_file_name is None: polyglot_file_name = "polyglot.mar.tar" diff --git a/fickling/pytorch.py b/fickling/pytorch.py index f91a690..50fdbb2 100644 --- a/fickling/pytorch.py +++ b/fickling/pytorch.py @@ -9,6 +9,7 @@ import fickling.polyglot from fickling.fickle import Pickled + class BaseInjection(torch.nn.Module): # This class allows you to combine the payload and original model def __init__(self, original_model: torch.nn.Module, payload: str): @@ -32,12 +33,12 @@ def __init__(self, path: Path, force: bool = False): def validate_file_format(self): self._formats = fickling.polyglot.identify_pytorch_file_format(self.path) - """ + """ PyTorch v1.3 and TorchScript v1.4 are explicitly supported by PyTorchModelWrapper. This class may work on other file formats depending on its construction. - To enable users to check that and load polyglots, the force argument exists. - There is a warning for TorchScript v1.4 because of the scripting/tracing/mixing edge cases. - Technically, injections applied to that format should not work torch.jit.load() but may work on torch.load(). + To enable users to check that and load polyglots, the force argument exists. + There is a warning for TorchScript v1.4 because of the scripting/tracing/mixing edge cases. + For example, an injection may work on torch.load() but not torch.jit.load(). """ if len(self._formats) == 0: if self.force is True: @@ -94,8 +95,7 @@ def validate_file_format(self): ) if self._formats[0] == "TorchScript v1.4": warnings.warn( - """Support for TorchScript v1.4 files is experimental. The effectiveness of the capabilities of - PyTorchModelWrapper, especially injection, is highly dependent on the construction of the model.""", + """Support for TorchScript v1.4 files is experimental.""", UserWarning, ) return self._formats @@ -126,8 +126,8 @@ def inject_payload( self.output_path = output_path if self.formats[0] == "TorchScript v1.4": warnings.warn( - """Support for TorchScript v1.4 files is experimental. Injections may not be effective - depending on the construction of the model and the target parser.""", + """Support for TorchScript v1.4 files is experimental. + Injections may not be effective depending on the model and the target parser.""", UserWarning, ) if injection == "insertion": diff --git a/test/test_polyglot.py b/test/test_polyglot.py index 9deb535..f0658fb 100644 --- a/test/test_polyglot.py +++ b/test/test_polyglot.py @@ -89,7 +89,7 @@ def tearDown(self): self.zip_filename, self.filename_torchscript_dup, self.filename_v1_3_dup, - self.standard_torchscript_polyglot_name + self.standard_torchscript_polyglot_name, ]: if os.path.exists(filename): os.remove(filename) @@ -181,7 +181,11 @@ def test_zip_properties(self): self.assertEqual(properties, proper_result) def test_create_standard_torchscript_polyglot(self): - polyglot.create_polyglot(self.filename_v1_3_dup, self.filename_torchscript_dup, self.standard_torchscript_polyglot_name, print_results=False) + polyglot.create_polyglot( + self.filename_v1_3_dup, + self.filename_torchscript_dup, + self.standard_torchscript_polyglot_name, + print_results=False, + ) formats = polyglot.identify_pytorch_file_format(self.standard_torchscript_polyglot_name) self.assertTrue({"PyTorch v1.3", "TorchScript v1.4"}.issubset(formats)) - diff --git a/test/test_pytorch.py b/test/test_pytorch.py index 181197c..fcb8746 100644 --- a/test/test_pytorch.py +++ b/test/test_pytorch.py @@ -28,6 +28,7 @@ def test_wrapper(self): PyTorchModelWrapper(self.filename_v1_3) except Exception as e: # noqa self.fail(f"PyTorchModelWrapper was not able to load a PyTorch v1.3 file: {e}") + def test_torchscript_wrapper(self): try: PyTorchModelWrapper(self.torchscript_filename) @@ -44,7 +45,6 @@ def test_torchscript_pickled(self): pickled_portion = result.pickled self.assertIsInstance(pickled_portion, Pickled) - def test_injection_insertion(self): try: result = PyTorchModelWrapper(self.filename_v1_3) @@ -57,19 +57,6 @@ def test_injection_insertion(self): f"PyTorchModelWrapper was not able to inject code into a PyTorch v1.3 file: {e}" ) - def test_injection_insertion(self): - try: - result = PyTorchModelWrapper(self.filename_v1_3) - temp_filename = "temp_filename" - result.inject_payload("print('Hello, World!')", temp_filename, injection="insertion") - if os.path.exists(temp_filename): - os.remove(temp_filename) - except Exception as e: # noqa - self.fail( - f"PyTorchModelWrapper was not able to inject code into a PyTorch v1.3 file: {e}" - ) - - def test_injection_combination(self): try: result = PyTorchModelWrapper(self.filename_v1_3) From 8089cbe54630530be153c436ffee0875e1760388 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Thu, 25 Jan 2024 08:52:52 -0500 Subject: [PATCH 08/16] Fix comment and file name --- README.md | 2 +- example/identify_pytorch_file.py | 4 ++-- safe.pkl | Bin 0 -> 22 bytes unsafe.pickle | Bin 0 -> 83 bytes 4 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 safe.pkl create mode 100644 unsafe.pickle diff --git a/README.md b/README.md index 8cc46bb..2a5fa41 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,7 @@ following PyTorch file formats: * **TorchScript v1.3**: ZIP file with data.pkl and constants.pkl (2 pickle files) * **TorchScript v1.4**: ZIP file with data.pkl, constants.pkl, and version (2 pickle files and a folder) * **PyTorch v1.3**: ZIP file containing data.pkl (1 pickle file) -* **PyTorch model archive format**: ZIP file that includes Python code files and pickle files +* **PyTorch model archive format[ZIP]**: ZIP file that includes Python code files and pickle files ```python >> import torch diff --git a/example/identify_pytorch_file.py b/example/identify_pytorch_file.py index 7e6ed04..bba42b8 100644 --- a/example/identify_pytorch_file.py +++ b/example/identify_pytorch_file.py @@ -5,11 +5,11 @@ model = models.mobilenet_v2() torch.save(model, "mobilenet.pth") +torch.save(model, "legacy_mobilenet.pth", _use_new_zipfile_serialization=False) -print("Identifying PyTorch 1.3 file:") +print("Identifying PyTorch v1.3 file:") potential_formats = polyglot.identify_pytorch_file_format("mobilenet.pth", print_results=True) -torch.save(model, "legacy_mobilenet.pth", _use_new_zipfile_serialization=False) print("Identifying PyTorch v0.1.10 file:") potential_formats_legacy = polyglot.identify_pytorch_file_format( "legacy_mobilenet.pth", print_results=True diff --git a/safe.pkl b/safe.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1c6c4ccab0cee0d58b978c1952d2b9bccf2213a3 GIT binary patch literal 22 YcmZo*naa%o0kKmwycxZjyqQz=04ZhzxBvhE literal 0 HcmV?d00001 diff --git a/unsafe.pickle b/unsafe.pickle new file mode 100644 index 0000000000000000000000000000000000000000..f9e7047eb188dcde3a9ae636b99f90bed7ebd8bb GIT binary patch literal 83 zcmZo*nd-p+0X?h*`Nf$PQ+n8nD~n4~bEiz6(xaQ2oRP1f?x|3mkzbmVqL7hTma34H inwqCjT#{ccLYaI0e literal 0 HcmV?d00001 From 5a95232b9bc267009442337936b9e34417efd876 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Thu, 25 Jan 2024 08:55:24 -0500 Subject: [PATCH 09/16] Remove unnecessary lines --- fickling/pytorch.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fickling/pytorch.py b/fickling/pytorch.py index 50fdbb2..d8f77f2 100644 --- a/fickling/pytorch.py +++ b/fickling/pytorch.py @@ -56,13 +56,6 @@ def validate_file_format(self): If it is a PyTorch file, raise an issue on GitHub. """ ) - # if (("PyTorch v1.3" not in self._formats) or - # { - # "TorchScript v1.4", - # "TorchScript v1.3", - # "TorchScript v1.1", - # "TorchScript v1.0", - # }.intersection(self._formats)): if ("PyTorch v1.3" not in self._formats) and ("TorchScript v1.4" not in self._formats): if "PyTorch v0.1.10" in self._formats: if self.force is True: From 0835462b618a74f2a18bf9b4e6f87aad0ce8beaf Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Thu, 25 Jan 2024 09:00:43 -0500 Subject: [PATCH 10/16] Clean up --- example/README.md | 4 ++-- safe.pkl | Bin 22 -> 0 bytes unsafe.pickle | Bin 83 -> 0 bytes 3 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 safe.pkl delete mode 100644 unsafe.pickle diff --git a/example/README.md b/example/README.md index b87f47c..aee4390 100644 --- a/example/README.md +++ b/example/README.md @@ -4,7 +4,7 @@ * [context_manager.py](https://github.com/trailofbits/fickling/blob/master/example/context_manager.py): Halt the deserialization of a malicious pickle file with the fickling context manager * [fault_injection.py](https://github.com/trailofbits/fickling/blob/master/example/fault_injection.py): Perform a fault injection on a PyTorch model and then analyze the result with `check_safety` * [inject_mobilenet.py](https://github.com/trailofbits/fickling/blob/master/example/inject_mobilenet.py): Override the `eval` method of a ML model using fickling and apply `fickling.is_likely_safe` to the model file -* [inject_pytorch.py](https://github.com/trailofbits/fickling/blob/master/example/inject_pytorch.py): Inject a model loaded from a PyTorch file with malicious code using fickling’s PyTorch module +* [inject_pytorch.py](https://github.com/trailofbits/fickling/blob/master/example/inject_pytorch.py): Inject a model loaded from a PyTorch file with malicious code using the PyTorch module * [numpy_poc.py](https://github.com/trailofbits/fickling/blob/master/example/numpy_poc.py): Analyze a malicious payload passed to `numpy.load()` * [trace_binary.py](https://github.com/trailofbits/fickling/blob/master/example/trace_binary.py): Decompile a payload using the tracing module -* [identify_pytorch_file.py](https://github.com/trailofbits/fickling/blob/master/example/identify_pytorch_file.py): Determine the file format type of 2 different PyTorch files \ No newline at end of file +* [identify_pytorch_file.py](https://github.com/trailofbits/fickling/blob/master/example/identify_pytorch_file.py): Identify 2 PyTorch files that are different formats \ No newline at end of file diff --git a/safe.pkl b/safe.pkl deleted file mode 100644 index 1c6c4ccab0cee0d58b978c1952d2b9bccf2213a3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22 YcmZo*naa%o0kKmwycxZjyqQz=04ZhzxBvhE diff --git a/unsafe.pickle b/unsafe.pickle deleted file mode 100644 index f9e7047eb188dcde3a9ae636b99f90bed7ebd8bb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 83 zcmZo*nd-p+0X?h*`Nf$PQ+n8nD~n4~bEiz6(xaQ2oRP1f?x|3mkzbmVqL7hTma34H inwqCjT#{ccLYaI0e From 1763ddc7422f410bcc87a5bdcf4268e88847b50a Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Thu, 25 Jan 2024 08:44:30 -0500 Subject: [PATCH 11/16] Lint and tidy files --- README.md | 2 +- example/README.md | 4 ++-- example/identify_pytorch_file.py | 8 +++++--- fickling/polyglot.py | 31 +++++++++++++++++++++---------- fickling/pytorch.py | 23 ++++++++--------------- test/test_polyglot.py | 10 +++++++--- test/test_pytorch.py | 15 +-------------- 7 files changed, 45 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 8cc46bb..2a5fa41 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,7 @@ following PyTorch file formats: * **TorchScript v1.3**: ZIP file with data.pkl and constants.pkl (2 pickle files) * **TorchScript v1.4**: ZIP file with data.pkl, constants.pkl, and version (2 pickle files and a folder) * **PyTorch v1.3**: ZIP file containing data.pkl (1 pickle file) -* **PyTorch model archive format**: ZIP file that includes Python code files and pickle files +* **PyTorch model archive format[ZIP]**: ZIP file that includes Python code files and pickle files ```python >> import torch diff --git a/example/README.md b/example/README.md index b87f47c..aee4390 100644 --- a/example/README.md +++ b/example/README.md @@ -4,7 +4,7 @@ * [context_manager.py](https://github.com/trailofbits/fickling/blob/master/example/context_manager.py): Halt the deserialization of a malicious pickle file with the fickling context manager * [fault_injection.py](https://github.com/trailofbits/fickling/blob/master/example/fault_injection.py): Perform a fault injection on a PyTorch model and then analyze the result with `check_safety` * [inject_mobilenet.py](https://github.com/trailofbits/fickling/blob/master/example/inject_mobilenet.py): Override the `eval` method of a ML model using fickling and apply `fickling.is_likely_safe` to the model file -* [inject_pytorch.py](https://github.com/trailofbits/fickling/blob/master/example/inject_pytorch.py): Inject a model loaded from a PyTorch file with malicious code using fickling’s PyTorch module +* [inject_pytorch.py](https://github.com/trailofbits/fickling/blob/master/example/inject_pytorch.py): Inject a model loaded from a PyTorch file with malicious code using the PyTorch module * [numpy_poc.py](https://github.com/trailofbits/fickling/blob/master/example/numpy_poc.py): Analyze a malicious payload passed to `numpy.load()` * [trace_binary.py](https://github.com/trailofbits/fickling/blob/master/example/trace_binary.py): Decompile a payload using the tracing module -* [identify_pytorch_file.py](https://github.com/trailofbits/fickling/blob/master/example/identify_pytorch_file.py): Determine the file format type of 2 different PyTorch files \ No newline at end of file +* [identify_pytorch_file.py](https://github.com/trailofbits/fickling/blob/master/example/identify_pytorch_file.py): Identify 2 PyTorch files that are different formats \ No newline at end of file diff --git a/example/identify_pytorch_file.py b/example/identify_pytorch_file.py index 924e09f..bba42b8 100644 --- a/example/identify_pytorch_file.py +++ b/example/identify_pytorch_file.py @@ -5,10 +5,12 @@ model = models.mobilenet_v2() torch.save(model, "mobilenet.pth") +torch.save(model, "legacy_mobilenet.pth", _use_new_zipfile_serialization=False) -print("Identifying PyTorch 1.3 file:") +print("Identifying PyTorch v1.3 file:") potential_formats = polyglot.identify_pytorch_file_format("mobilenet.pth", print_results=True) -torch.save(model, "legacy_mobilenet.pth", _use_new_zipfile_serialization=False) print("Identifying PyTorch v0.1.10 file:") -potential_formats_legacy = polyglot.identify_pytorch_file_format("legacy_mobilenet.pth", print_results=True) +potential_formats_legacy = polyglot.identify_pytorch_file_format( + "legacy_mobilenet.pth", print_results=True +) diff --git a/fickling/polyglot.py b/fickling/polyglot.py index 5467205..2696deb 100644 --- a/fickling/polyglot.py +++ b/fickling/polyglot.py @@ -32,7 +32,7 @@ def check_and_find_in_zip( - zip_path, file_name_or_extension, return_path=False, check_extension=False + zip_path, file_name_or_extension, return_path=False, check_extension=False ): """Check for a file in the zip and return its path or boolean if found.""" try: @@ -165,9 +165,9 @@ def check_for_corruption(properties): # We expect this to be expanded upon if properties["is_torch_zip"]: if ( - properties["has_model_json"] - and not properties["has_attribute_pkl"] - and not properties["has_constants_pkl"] + properties["has_model_json"] + and not properties["has_attribute_pkl"] + and not properties["has_constants_pkl"] ): corrupted = True reason = """Your file may be corrupted. It contained a @@ -222,7 +222,9 @@ def identify_pytorch_file_format(file, print_properties=False, print_results=Fal secondary = formats[1:] if len(secondary) != 0: if print_results: - print("It is also possible that your file can be validly interpreted as: ", secondary) + print( + "It is also possible that your file can be validly interpreted as: ", secondary + ) else: if print_results: print( @@ -245,7 +247,9 @@ def create_zip_pickle_polyglot(zip_file, pickle_file): append_file(zip_file, pickle_file) -def create_mar_legacy_pickle_polyglot(files, print_results=False, polyglot_file_name="polyglot.mar.pt"): +def create_mar_legacy_pickle_polyglot( + files, print_results=False, polyglot_file_name="polyglot.mar.pt" +): files.sort(key=lambda x: x[1] != "PyTorch model archive format") if print_results: print("Making a PyTorch MAR/PyTorch v0.1.10 polyglot") @@ -255,7 +259,9 @@ def create_mar_legacy_pickle_polyglot(files, print_results=False, polyglot_file_ return polyglot_found -def create_standard_torchscript_polyglot(files, print_results=False, polyglot_file_name="polyglot.pt"): +def create_standard_torchscript_polyglot( + files, print_results=False, polyglot_file_name="polyglot.pt" +): if print_results: print("Making a PyTorch v1.3/TorchScript v1.4 polyglot") print("Warning: For some parsers, this may generate polymocks instead of polyglots.") @@ -282,9 +288,12 @@ def create_standard_torchscript_polyglot(files, print_results=False, polyglot_fi polyglot_found = True return polyglot_found -def create_mar_legacy_tar_polyglot(files, print_results=False, polyglot_file_name="polyglot.mar.tar"): + +def create_mar_legacy_tar_polyglot( + files, print_results=False, polyglot_file_name="polyglot.mar.tar" +): if print_results: - print("Making a PyTorch v0.1.1/PyTorch MAR polyglot") + print("Making a PyTorch v0.1.1/PyTorch MAR polyglot") mar_file = [file[0] for file in files if file[1] == "PyTorch model archive format"][0] tar_file = [file[0] for file in files if file[1] == "PyTorch v0.1.1"][0] polyglot_file = append_file(mar_file, tar_file) @@ -311,7 +320,9 @@ def create_polyglot(first_file, second_file, polyglot_file_name=None, print_resu if {"PyTorch v1.3", "TorchScript v1.4"}.issubset(formats): if polyglot_file_name is None: polyglot_file_name = "polyglot.pt" - polyglot_found = create_standard_torchscript_polyglot(files, print_results, polyglot_file_name) + polyglot_found = create_standard_torchscript_polyglot( + files, print_results, polyglot_file_name + ) if {"PyTorch model archive format", "PyTorch v0.1.1"}.issubset(formats): if polyglot_file_name is None: polyglot_file_name = "polyglot.mar.tar" diff --git a/fickling/pytorch.py b/fickling/pytorch.py index f91a690..d8f77f2 100644 --- a/fickling/pytorch.py +++ b/fickling/pytorch.py @@ -9,6 +9,7 @@ import fickling.polyglot from fickling.fickle import Pickled + class BaseInjection(torch.nn.Module): # This class allows you to combine the payload and original model def __init__(self, original_model: torch.nn.Module, payload: str): @@ -32,12 +33,12 @@ def __init__(self, path: Path, force: bool = False): def validate_file_format(self): self._formats = fickling.polyglot.identify_pytorch_file_format(self.path) - """ + """ PyTorch v1.3 and TorchScript v1.4 are explicitly supported by PyTorchModelWrapper. This class may work on other file formats depending on its construction. - To enable users to check that and load polyglots, the force argument exists. - There is a warning for TorchScript v1.4 because of the scripting/tracing/mixing edge cases. - Technically, injections applied to that format should not work torch.jit.load() but may work on torch.load(). + To enable users to check that and load polyglots, the force argument exists. + There is a warning for TorchScript v1.4 because of the scripting/tracing/mixing edge cases. + For example, an injection may work on torch.load() but not torch.jit.load(). """ if len(self._formats) == 0: if self.force is True: @@ -55,13 +56,6 @@ def validate_file_format(self): If it is a PyTorch file, raise an issue on GitHub. """ ) - # if (("PyTorch v1.3" not in self._formats) or - # { - # "TorchScript v1.4", - # "TorchScript v1.3", - # "TorchScript v1.1", - # "TorchScript v1.0", - # }.intersection(self._formats)): if ("PyTorch v1.3" not in self._formats) and ("TorchScript v1.4" not in self._formats): if "PyTorch v0.1.10" in self._formats: if self.force is True: @@ -94,8 +88,7 @@ def validate_file_format(self): ) if self._formats[0] == "TorchScript v1.4": warnings.warn( - """Support for TorchScript v1.4 files is experimental. The effectiveness of the capabilities of - PyTorchModelWrapper, especially injection, is highly dependent on the construction of the model.""", + """Support for TorchScript v1.4 files is experimental.""", UserWarning, ) return self._formats @@ -126,8 +119,8 @@ def inject_payload( self.output_path = output_path if self.formats[0] == "TorchScript v1.4": warnings.warn( - """Support for TorchScript v1.4 files is experimental. Injections may not be effective - depending on the construction of the model and the target parser.""", + """Support for TorchScript v1.4 files is experimental. + Injections may not be effective depending on the model and the target parser.""", UserWarning, ) if injection == "insertion": diff --git a/test/test_polyglot.py b/test/test_polyglot.py index 9deb535..f0658fb 100644 --- a/test/test_polyglot.py +++ b/test/test_polyglot.py @@ -89,7 +89,7 @@ def tearDown(self): self.zip_filename, self.filename_torchscript_dup, self.filename_v1_3_dup, - self.standard_torchscript_polyglot_name + self.standard_torchscript_polyglot_name, ]: if os.path.exists(filename): os.remove(filename) @@ -181,7 +181,11 @@ def test_zip_properties(self): self.assertEqual(properties, proper_result) def test_create_standard_torchscript_polyglot(self): - polyglot.create_polyglot(self.filename_v1_3_dup, self.filename_torchscript_dup, self.standard_torchscript_polyglot_name, print_results=False) + polyglot.create_polyglot( + self.filename_v1_3_dup, + self.filename_torchscript_dup, + self.standard_torchscript_polyglot_name, + print_results=False, + ) formats = polyglot.identify_pytorch_file_format(self.standard_torchscript_polyglot_name) self.assertTrue({"PyTorch v1.3", "TorchScript v1.4"}.issubset(formats)) - diff --git a/test/test_pytorch.py b/test/test_pytorch.py index 181197c..fcb8746 100644 --- a/test/test_pytorch.py +++ b/test/test_pytorch.py @@ -28,6 +28,7 @@ def test_wrapper(self): PyTorchModelWrapper(self.filename_v1_3) except Exception as e: # noqa self.fail(f"PyTorchModelWrapper was not able to load a PyTorch v1.3 file: {e}") + def test_torchscript_wrapper(self): try: PyTorchModelWrapper(self.torchscript_filename) @@ -44,7 +45,6 @@ def test_torchscript_pickled(self): pickled_portion = result.pickled self.assertIsInstance(pickled_portion, Pickled) - def test_injection_insertion(self): try: result = PyTorchModelWrapper(self.filename_v1_3) @@ -57,19 +57,6 @@ def test_injection_insertion(self): f"PyTorchModelWrapper was not able to inject code into a PyTorch v1.3 file: {e}" ) - def test_injection_insertion(self): - try: - result = PyTorchModelWrapper(self.filename_v1_3) - temp_filename = "temp_filename" - result.inject_payload("print('Hello, World!')", temp_filename, injection="insertion") - if os.path.exists(temp_filename): - os.remove(temp_filename) - except Exception as e: # noqa - self.fail( - f"PyTorchModelWrapper was not able to inject code into a PyTorch v1.3 file: {e}" - ) - - def test_injection_combination(self): try: result = PyTorchModelWrapper(self.filename_v1_3) From eb63b0bc29e6898143dd5dc04e43f0844fdbb169 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Thu, 25 Jan 2024 09:12:57 -0500 Subject: [PATCH 12/16] Delete extraneous information --- fickling/polyglot.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fickling/polyglot.py b/fickling/polyglot.py index 2696deb..de4b9b3 100644 --- a/fickling/polyglot.py +++ b/fickling/polyglot.py @@ -21,9 +21,6 @@ • PyTorch v1.3: ZIP file containing data.pkl (1 pickle file) • PyTorch model archive format[ZIP]: ZIP file that includes Python code files and pickle files -Officially, PyTorch v0.1.1 and TorchScript < v1.4 are deprecated. -However, they are still supported by some legacy parsers - This description draws from this PyTorch GitHub issue: https://github.com/pytorch/pytorch/issues/31877. If any inaccuracies in that description are found, that should be reflected in this code. If any new PyTorch file formats are made, that should be added to this code. From 5231abae311be16aef044e11e7573947f08bdd41 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Thu, 25 Jan 2024 11:14:48 -0500 Subject: [PATCH 13/16] Bump version number for release --- fickling/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fickling/__init__.py b/fickling/__init__.py index 9d14d28..ac4dc0e 100644 --- a/fickling/__init__.py +++ b/fickling/__init__.py @@ -7,4 +7,4 @@ # The above lines enables `fickling.load()` and `with fickling.check_safety()` # The comments are necessary to comply with linters -__version__ = "0.0.8" +__version__ = "0.1.0" From f97fbd48d6943f3d1661ce0a55fdbf27b5578726 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Fri, 26 Jan 2024 13:59:00 -0500 Subject: [PATCH 14/16] Clarify what polyglot files are --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a5fa41..f648025 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,9 @@ Module( ### PyTorch polyglots -We currently support inspecting, identifying, and creating file polyglots between the +PyTorch contains multiple file formats with which one can make polyglot files, which +are files that can be validly interpreted as more than one file format. +Fickling supports identifying, inspecting, and creating polyglots with the following PyTorch file formats: * **PyTorch v0.1.1**: Tar file with sys_info, pickle, storages, and tensors From 56dadf38dc9aad1741e3e761eca91393318de67c Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Fri, 26 Jan 2024 14:34:18 -0500 Subject: [PATCH 15/16] More lint --- example/fault_injection.py | 1 + fickling/analysis.py | 18 ++++++++++-------- fickling/fickle.py | 6 ++---- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/example/fault_injection.py b/example/fault_injection.py index 9c3f871..feb9de9 100644 --- a/example/fault_injection.py +++ b/example/fault_injection.py @@ -4,6 +4,7 @@ Note you may need to run `pip install pytorchfi` """ + import pickle import torch diff --git a/fickling/analysis.py b/fickling/analysis.py index 8c9ddfc..2cbb40d 100644 --- a/fickling/analysis.py +++ b/fickling/analysis.py @@ -107,9 +107,9 @@ def __init__( self.severity: Severity = severity self.message: Optional[str] = message self.analysis_name: str = analysis_name - self.trigger: Optional[ - str - ] = trigger # Field to store the trigger code fragment or artifact + self.trigger: Optional[str] = ( + trigger # Field to store the trigger code fragment or artifact + ) def __lt__(self, other): return isinstance(other, AnalysisResult) and ( @@ -296,11 +296,13 @@ def to_dict(self, verbosity: Severity = Severity.SUSPICIOUS): analysis_message = self.to_string(verbosity) severity_data = { "severity": self.severity.name, - "analysis": analysis_message - if analysis_message.strip() - else "Warning: Fickling failed to detect any overtly unsafe code, but the pickle file" - "may still be unsafe." - "Do not unpickle this file if it is from an untrusted source!\n\n", + "analysis": ( + analysis_message + if analysis_message.strip() + else "Warning: Fickling failed to detect any overtly unsafe code, but the pickle file" + "may still be unsafe." + "Do not unpickle this file if it is from an untrusted source!\n\n" + ), "detailed_results": self.detailed_results(), } return severity_data diff --git a/fickling/fickle.py b/fickling/fickle.py index b1d0e00..ac7c7a0 100644 --- a/fickling/fickle.py +++ b/fickling/fickle.py @@ -760,13 +760,11 @@ def __init__(self, initial_value: Iterable[T] = ()): @overload @abstractmethod - def __getitem__(self, i: int) -> T: - ... + def __getitem__(self, i: int) -> T: ... @overload @abstractmethod - def __getitem__(self, s: slice) -> GenericSequence: - ... + def __getitem__(self, s: slice) -> GenericSequence: ... def __getitem__(self, i: int) -> T: return self._stack[i] From 2188f255840850505be18c5c944c5c04749cbc22 Mon Sep 17 00:00:00 2001 From: Suha Sabi Hussain Date: Fri, 26 Jan 2024 14:38:05 -0500 Subject: [PATCH 16/16] More lint --- fickling/analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fickling/analysis.py b/fickling/analysis.py index 2cbb40d..1fa41a1 100644 --- a/fickling/analysis.py +++ b/fickling/analysis.py @@ -299,8 +299,8 @@ def to_dict(self, verbosity: Severity = Severity.SUSPICIOUS): "analysis": ( analysis_message if analysis_message.strip() - else "Warning: Fickling failed to detect any overtly unsafe code, but the pickle file" - "may still be unsafe." + else "Warning: Fickling failed to detect any overtly unsafe code," + "but the pickle file may still be unsafe." "Do not unpickle this file if it is from an untrusted source!\n\n" ), "detailed_results": self.detailed_results(),