Add "artifact_type" to Artifact dumping format (#137)

* Add "artifact_type" to Artifact dumping format
* Remove dead code
binsync · Nov 24, 2024 · 067dac5 · 067dac5
1 parent 669215d
commit 067dac5
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 4 deletions.
diff --git a/libbs/__init__.py b/libbs/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.7.0"
+__version__ = "2.8.0"
 
 
 import logging

diff --git a/libbs/artifacts/__init__.py b/libbs/artifacts/__init__.py
@@ -1,3 +1,7 @@
+import json
+
+import toml
+
 from .formatting import TomlHexEncoder, ArtifactFormat
 from .artifact import Artifact
 from .comment import Comment
@@ -26,3 +30,60 @@
     Context.__name__: Context,
     Typedef.__name__: Typedef,
 }
+
+
+def _dict_from_str(art_str: str, fmt=ArtifactFormat.TOML) -> dict:
+    if fmt == ArtifactFormat.TOML:
+        return toml.loads(art_str)
+    elif fmt == ArtifactFormat.JSON:
+        return json.loads(art_str)
+    else:
+        raise ValueError(f"Loading from format {fmt} is not yet supported.")
+
+
+def _art_from_dict(art_dict: dict) -> Artifact:
+    art_type_str = art_dict.get(Artifact.ART_TYPE_STR, None)
+    if art_type_str is None:
+        raise ValueError(f"Artifact type string not found in artifact data: {art_dict}. Is this a valid artifact?")
+
+    art_cls = ART_NAME_TO_CLS[art_type_str]
+    art = art_cls()
+    art.__setstate__(art_dict)
+    return art
+
+
+def _load_arts_from_list(art_strs: list[str], fmt=ArtifactFormat.TOML) -> list[Artifact]:
+    arts = []
+    for art_str in art_strs:
+        data_dict = _dict_from_str(art_str, fmt=fmt)
+        art = _art_from_dict(data_dict)
+        arts.append(art)
+    return arts
+
+
+def _load_arts_from_string(art_str: str, fmt=ArtifactFormat.TOML) -> list[Artifact]:
+    data_dict = _dict_from_str(art_str, fmt=fmt)
+    if isinstance(data_dict, dict):
+        data_dicts = list(data_dict.values())
+    elif isinstance(data_dict, list):
+        data_dicts = data_dict
+    else:
+        raise ValueError(f"Unexpected data type: {type(data_dict)}")
+
+    arts = []
+    for v in data_dicts:
+        art = _art_from_dict(v)
+        arts.append(art)
+
+    return arts
+
+
+def load_many_artifacts(art_strings: list[str], fmt=ArtifactFormat.TOML) -> list[Artifact]:
+    """
+    A helper function to load many dumped artifacts from a list of strings. Each string should have been dumped
+    using the `dumps` method of an artifact.
+
+    :param art_strings: A list of strings, each containing exactly one dumped artifact.
+    :param fmt: The format of the dumped artifacts.
+    """
+    return _load_arts_from_list(art_strings, fmt=fmt)
diff --git a/libbs/artifacts/artifact.py b/libbs/artifacts/artifact.py
@@ -11,10 +11,12 @@ class Artifact:
     """
     The Artifact class acts as the base for all other artifacts that can be produced by a decompiler (or decompiler
     adjacent tool). In general, the comparisons of these derived classes should only be done on the attributes in
-    __slots__, with the exception of the last_change property.
+    __slots__, except for the last_change property.
     """
     LST_CHNG_ATTR = "last_change"
     ADDR_ATTR = "addr"
+    ART_TYPE_STR = "artifact_type"
+
     ATTR_ATTR_IGNORE_SET = "_attr_ignore_set"
     __slots__ = (
         LST_CHNG_ATTR,
@@ -99,6 +101,8 @@ def _from_c_string(cls, cstring) -> Dict:
 
     def dumps(self, fmt=ArtifactFormat.TOML) -> str:
         dict_data = self.__getstate__()
+        # encode the artifact type
+        dict_data.update({self.ART_TYPE_STR: self.__class__.__name__})
         if fmt == ArtifactFormat.TOML:
             return toml.dumps(dict_data, encoder=TomlHexEncoder())
         elif fmt == ArtifactFormat.JSON:
@@ -123,6 +127,8 @@ def loads(cls, string, fmt=ArtifactFormat.TOML) -> "Artifact":
         else:
             raise ValueError(f"Loading from format {fmt} is not yet supported.")
 
+        # remove the artifact type (if it exists)
+        dict_data.pop(Artifact.ART_TYPE_STR, None)
         art = cls()
         art.__setstate__(dict_data)
         return art
@@ -150,7 +156,7 @@ def dumps_many(cls, artifacts: List["Artifact"], key_attr=ADDR_ATTR, fmt=Artifac
             raise ValueError(f"Dumping many to format {fmt} is not yet supported.")
 
     @classmethod
-    def loads_many(cls, string, fmt=ArtifactFormat.TOML) -> List["Artifact"]:
+    def loads_many(cls, string: str, fmt=ArtifactFormat.TOML) -> List["Artifact"]:
         if fmt == ArtifactFormat.TOML:
             dict_data = toml.loads(string)
         elif fmt == ArtifactFormat.JSON:

diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py
@@ -5,7 +5,8 @@
 
 import toml
 from libbs.artifacts import (
-    FunctionHeader, StackVariable, FunctionArgument, Function, ArtifactFormat, Struct, StructMember
+    FunctionHeader, StackVariable, FunctionArgument, Function, ArtifactFormat, Struct, StructMember,
+    load_many_artifacts, Artifact
 )
 
 
@@ -170,6 +171,29 @@ def test_serialization(self):
             loaded_struct = Struct.loads(serialized_struct, fmt=fmt)
             assert loaded_struct == struct
 
+    def test_many_deserialization(self):
+        func, _ = generate_test_funcs(0x400000)
+        struct = Struct(name="some_struct", size=8, members={
+            0: StructMember(offset=0, name="m0", type_="int", size=4),
+            4: StructMember(offset=4, name="m4", type_="long", size=8)
+        })
+
+        # test loading many in a list of strings
+        data_strs = [func.dumps(fmt=ArtifactFormat.JSON), struct.dumps(fmt=ArtifactFormat.JSON)]
+        for data_str in data_strs:
+            data_dict = json.loads(data_str)
+            # the ART_TYPE_STR should be in the data to tell you what type of artifact it is
+            assert Artifact.ART_TYPE_STR in data_dict
+
+        loaded_arts = load_many_artifacts(data_strs, fmt=ArtifactFormat.JSON)
+        assert len(loaded_arts) == 2
+
+        loaded_func = loaded_arts[0]
+        assert loaded_func == func
+
+        loaded_struct = loaded_arts[1]
+        assert loaded_struct == struct
+
 
 if __name__ == "__main__":
     unittest.main(argv=sys.argv)