From 0bbbc393c2f94977a5bb0e5cd45c7fa58818cbb7 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 16:30:20 +0100 Subject: [PATCH 01/20] Add csv exporter. Improve abstraction layer over generic graphs --- src/qbindiff/__init__.py | 2 +- src/qbindiff/abstract.py | 58 ++++++++++--- src/qbindiff/loader/function.py | 33 +++++--- src/qbindiff/loader/program.py | 90 +++++++++++--------- src/qbindiff/mapping/mapping.py | 146 +++++++++++++++++++++++--------- src/qbindiff/types.py | 22 ++--- 6 files changed, 231 insertions(+), 120 deletions(-) diff --git a/src/qbindiff/__init__.py b/src/qbindiff/__init__.py index 880cfbc..fd2811d 100644 --- a/src/qbindiff/__init__.py +++ b/src/qbindiff/__init__.py @@ -50,7 +50,7 @@ """ from qbindiff.version import __version__ -from qbindiff.abstract import GenericGraph +from qbindiff.abstract import GenericGraph, GenericNode from qbindiff.differ import QBinDiff, DiGraphDiffer, GraphDiffer, Differ from qbindiff.mapping import Mapping from qbindiff.loader import Program, Function diff --git a/src/qbindiff/abstract.py b/src/qbindiff/abstract.py index 0f9a673..34a1f04 100644 --- a/src/qbindiff/abstract.py +++ b/src/qbindiff/abstract.py @@ -18,9 +18,30 @@ throught the qbindiff module (the differ, the matcher, the exporters, etc...). """ +from __future__ import annotations from abc import ABCMeta, abstractmethod -from collections.abc import Iterator -from typing import Any +from collections.abc import Hashable +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterable + from typing import Any + from qbindiff.types import NodeLabel + + +class GenericNode(Hashable): + """ + Abstract class representing a generic node + """ + + @abstractmethod + def get_label(self) -> NodeLabel: + """ + Get the label associated to this node + + :returns: The node label associated with this node + """ + raise NotImplementedError() class GenericGraph(metaclass=ABCMeta): @@ -34,40 +55,51 @@ def __len__(self) -> int: raise NotImplementedError() @abstractmethod - def items(self) -> Iterator[tuple[Any, Any]]: + def items(self) -> Iterable[tuple[NodeLabel, GenericNode]]: """ - Return an iterator over the items. Each item is {node_label: node} + Iterate over the items. Each item is {node_label: node} + + :returns: A :py:class:`Iterable` over the items. Each item is + a tuple (node_label, node) """ raise NotImplementedError() @abstractmethod - def get_node(self, node_label: Any): + def get_node(self, node_label: NodeLabel) -> GenericNode: """ - Returns the node identified by the `node_label` + Get the node identified by the `node_label` + + :param node_label: the unique identifier of the node + :returns: The node identified by the label """ raise NotImplementedError() @property @abstractmethod - def node_labels(self) -> Iterator[Any]: + def node_labels(self) -> Iterable[NodeLabel]: """ - Return an iterator over the node labels + Iterate over the node labels + + :returns: An :py:class:`Iterable` over the node labels """ raise NotImplementedError() @property @abstractmethod - def nodes(self) -> Iterator[Any]: + def nodes(self) -> Iterable[GenericNode]: """ - Return an iterator over the nodes + Iterate over the nodes themselves + + :returns: An :py:class:`Iterable` over the nodes """ raise NotImplementedError() @property @abstractmethod - def edges(self) -> Iterator[tuple[Any, Any]]: + def edges(self) -> Iterable[tuple[NodeLabel, NodeLabel]]: """ - Return an iterator over the edges. - An edge is a pair (node_label_a, node_label_b) + Iterate over the edges. An edge is a pair (node_label_a, node_label_b) + + :returns: An :py:class`Iterable` over the edges. """ raise NotImplementedError() diff --git a/src/qbindiff/loader/function.py b/src/qbindiff/loader/function.py index e4dd188..079a388 100644 --- a/src/qbindiff/loader/function.py +++ b/src/qbindiff/loader/function.py @@ -16,20 +16,24 @@ """ from __future__ import annotations -import networkx -from collections.abc import Mapping, Generator +from typing import TYPE_CHECKING +from qbindiff.abstract import GenericNode from qbindiff.loader import BasicBlock from qbindiff.loader.types import FunctionType -from qbindiff.types import Addr -from qbindiff.loader.backend.abstract import AbstractFunctionBackend +if TYPE_CHECKING: + import networkx + from collections.abc import Mapping, Generator + from qbindiff.loader.backend.abstract import AbstractFunctionBackend + from qbindiff.types import Addr -class Function(Mapping[Addr, BasicBlock]): + +class Function(Mapping[Addr, BasicBlock], GenericNode): """ Representation of a binary function. - This class is a dict of basic block addreses to the basic block. + This class is a non-mutable mapping between basic block's address and the basic block itself. It lazily loads all the basic blocks when iterating through them or even accessing one of them and it unloads all of them after the iteration has ended. @@ -50,7 +54,7 @@ class Function(Mapping[Addr, BasicBlock]): """ def __init__(self, backend: AbstractFunctionBackend): - super(Function, self).__init__() + super().__init__() # The basic blocks are lazily loaded self._basic_blocks = None @@ -94,7 +98,7 @@ def __getitem__(self, key: Addr) -> BasicBlock: self._unload() return bb - def __iter__(self) -> Generator[BasicBlock]: + def __iter__(self) -> Generator[BasicBlock, None, None]: """ Iterate over basic blocks, not addresses """ @@ -115,11 +119,11 @@ def __len__(self) -> int: self._unload() return size - def items(self) -> Generator[Addr, BasicBlock]: + def items(self) -> Generator[tuple[Addr, BasicBlock], None, None]: """ Returns a generator of tuples with addresses of basic blocks and the corresponding basic blocks objects - :return: generator (addr, basicblock) + :returns: generator (addr, basicblock) """ if self._basic_blocks is not None: @@ -151,6 +155,15 @@ def _unload(self) -> None: self._basic_blocks = None self._backend.unload_blocks() + def get_label(self) -> Addr: + """ + Get the address associated to this function + + :returns: The address associated with the function + """ + + return self.addr + @property def edges(self) -> list[tuple[Addr, Addr]]: """ diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index a542fe7..c5841d8 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -16,28 +16,33 @@ """ from __future__ import annotations -import networkx -from collections.abc import Callable, Iterator +from typing import TYPE_CHECKING from qbindiff.abstract import GenericGraph -from qbindiff.loader import Function, Structure +from qbindiff.loader import Function from qbindiff.loader.types import LoaderType -from qbindiff.types import Addr -from qbindiff.loader.backend.abstract import AbstractProgramBackend +if TYPE_CHECKING: + import networkx + from networkx.classes.reportviews import OutEdgeView + from collections.abc import Callable, Iterator + from qbindiff.loader import Structure + from qbindiff.loader.backend.abstract import AbstractProgramBackend + from qbindiff.types import Addr -class Program(dict, GenericGraph): + +class Program(MutableMapping[Addr, Function], GenericGraph): """ Program class that shadows the underlying program backend used. - It inherits from dict which keys are function addresses and - values are Function object. + It is a :py:class:`MutableMapping`, where keys are function addresses and + values are :py:class:`Function` objects. - The node label is the function address, the node itself is the Function object + The node label is the function address, the node itself is the :py:class:`Function` object """ def __init__(self, loader: LoaderType | None, /, *args, **kwargs): - super(Program, self).__init__() + super().__init__() self._backend = None if loader is None and (backend := kwargs.get("backend")) is not None: @@ -62,6 +67,7 @@ def __init__(self, loader: LoaderType | None, /, *args, **kwargs): raise NotImplementedError("Loader: %s not implemented" % loader) self._filter = lambda x: True + self._functions: dict[Addr, Function] = {} # underlying dictionary containing the functions self._load_functions() @staticmethod @@ -110,45 +116,50 @@ def from_backend(backend: AbstractProgramBackend) -> Program: def __repr__(self) -> str: return "" % self.name - def __iter__(self): + def __iter__(self) -> Iterator[Addr]: """ - Override the built-in __iter__ to iterate all functions - located in the program. + Iterate over all functions' address located in the program. - :return: Iterator of all functions (sorted by address) + :return: Iterator of all functions' address """ - for addr in sorted(self.keys()): - if self._filter(addr): # yield function only if filter agree to keep it - yield self[addr] + yield from self.node_labels - def _load_functions(self) -> None: - """ - Load the functions from the backend + def __len__(self) -> int: + return len(self._functions) - :return: None - """ + def __getitem__(self, key): + return self._functions.__getitem__(key) + + def __setitem__(self, key, value): + self._functions.__setitem__(key) + + def __delitem__(self, key): + self._functions.__delitem__(key) + + def _load_functions(self) -> None: + """Load the functions from the backend""" for function in map(Function.from_backend, self._backend.functions): self[function.addr] = function def items(self) -> Iterator[tuple[Addr, Function]]: """ - Return an iterator over the items. Each item is {node_label: node} + Iterate over the items. Each item is {address: :py:class:`Function`} - :return: an iterator over the program elements. Each element is a tuple of shape (function_addr, function_obj) + :returns: A :py:class:`Iterator` over the functions. Each element + is a tuple (function_addr, function_obj) """ - for addr in self.keys(): - if self._filter(addr): # yield function only if filter agree to keep it - yield (addr, self[addr]) + # yield function only if filter agree to keep it + yield from (lambda i: self._filter(i[0]), self._functions.items()) def get_node(self, node_label: Addr) -> Function: """ - Returns the node identified by the `node_label` + Get the function identified by the address :paramref:`node_label` - :param node_label: the node_label or the address from which we want to recover the object - :return: the function identified by its address + :param node_label: the address of the function that will be returned + :returns: the function identified by its address """ return self[node_label] @@ -156,26 +167,29 @@ def get_node(self, node_label: Addr) -> Function: @property def node_labels(self) -> Iterator[Addr]: """ - Iterator over the node labels + Iterate over the functions' address + + :returns: An :py:class:`Iterator` over the functions' address """ - for addr in self.keys(): - if self._filter(addr): - yield addr + yield from filter(self._filter, self.keys()) @property def nodes(self) -> Iterator[Function]: """ - Iterator over the nodes + Iterate over the functions + + :returns: An :py:class:`Iterator` over the functions """ yield from self.__iter__() @property - def edges(self) -> Iterator[tuple[Addr, Addr]]: + def edges(self) -> OutEdgeView[Addr, Addr]: """ - Iterator over the edges. - An edge is a pair (addr_a, addr_b) + Iterate over the edges. An edge is a pair (addr_a, addr_b) + + :returns: An :py:class`OutEdgeView` over the edges. """ return self.callgraph.edges diff --git a/src/qbindiff/mapping/mapping.py b/src/qbindiff/mapping/mapping.py index a66aa02..87241e6 100644 --- a/src/qbindiff/mapping/mapping.py +++ b/src/qbindiff/mapping/mapping.py @@ -15,17 +15,26 @@ """Simple mapping interface """ -from qbindiff.types import Match, ExtendedMapping, Item +from __future__ import annotations +import csv +from typing import TYPE_CHECKING + +from qbindiff.types import Match + +if TYPE_CHECKING: + from pathlib import Path + from typing import Callable + from qbindiff.types import ExtendedMapping, Node class Mapping: """ This class represents an interface to access the result of the matching analysis. - Its interface is independent of the underlying objects / items manipulated. + Its interface is independent of the underlying :py:obj:`Node`s manipulated. """ def __init__( - self, mapping: ExtendedMapping, unmatched_primary: set[Item], unmatched_secondary: set[Item] + self, mapping: ExtendedMapping, unmatched_primary: set[Node], unmatched_secondary: set[Node] ): self._matches = [Match(*x) for x in mapping] self._primary_unmatched = unmatched_primary @@ -46,7 +55,7 @@ def normalized_similarity(self) -> float: """ Normalized similarity of the diff (from 0 to 1) """ - return (2 * self.similarity) / (self.nb_item_primary + self.nb_item_secondary) + return (2 * self.similarity) / (self.nb_node_primary + self.nb_node_secondary) @property def squares(self) -> float: @@ -57,23 +66,23 @@ def squares(self) -> float: def add_match( self, - item1: Item, - item2: Item, + node1: Node, + node2: Node, similarity: float = None, confidence: float = 0.0, squares: int = None, ) -> None: """ - Add the given match between the two items. + Add the given match between the two nodes. - :param item1: function address in primary - :param item2: function address in secondary + :param node1: node in primary + :param node2: node in secondary :param similarity: similarity metric as float :param confidence: confidence in the result (0..1) :param squares: Number of squares being made :return: None """ - self._matches.append(Match(item1, item2, similarity, confidence, squares)) + self._matches.append(Match(node1, node2, similarity, confidence, squares)) def remove_match(self, match: Match) -> None: """ @@ -85,30 +94,30 @@ def remove_match(self, match: Match) -> None: self._matches.remove(match) @property - def primary_matched(self) -> set[Item]: + def primary_matched(self) -> set[Node]: """ - Set of items matched in primary + Set of nodes matched in primary """ return {x.primary for x in self._matches} @property - def primary_unmatched(self) -> set[Item]: + def primary_unmatched(self) -> set[Node]: """ - Set of items unmatched in primary. + Set of nodes unmatched in primary. """ return self._primary_unmatched @property - def secondary_matched(self) -> set[Item]: + def secondary_matched(self) -> set[Node]: """ - Set of items matched in the secondary object. + Set of nodes matched in the secondary object. """ return {x.secondary for x in self._matches} @property - def secondary_unmatched(self) -> set[Item]: + def secondary_unmatched(self) -> set[Node]: """ - Set of items unmatched in the secondary object. + Set of nodes unmatched in the secondary object. """ return self._secondary_unmatched @@ -122,69 +131,122 @@ def nb_match(self) -> int: @property def nb_unmatched_primary(self) -> int: """ - Number of unmatched items in primary. + Number of unmatched nodes in primary. """ return len(self._primary_unmatched) @property def nb_unmatched_secondary(self) -> int: """ - Number of unmatched items in secondary. + Number of unmatched nodes in secondary. """ return len(self._secondary_unmatched) @property - def nb_item_primary(self) -> int: + def nb_nodes_primary(self) -> int: """ - Total number of items in primary + Total number of nodes in primary """ return self.nb_match + self.nb_unmatched_primary @property - def nb_item_secondary(self) -> int: + def nb_nodes_secondary(self) -> int: """ - Total number of items in secondary. + Total number of nodes in secondary. """ return self.nb_match + self.nb_unmatched_secondary - def match_primary(self, item: Item) -> Match | None: + def match_primary(self, node: Node) -> Match | None: """ - Returns the match associated with the given primary item (if any). + Returns the match associated with the given primary node (if any). - :param item: item to match in primary + :param node: node to match in primary :return: optional match """ for m in self._matches: - if m.primary == item: + if m.primary == node: return m return None - def match_secondary(self, item: Item) -> Match | None: + def match_secondary(self, node: Node) -> Match | None: """ - Returns the match associated with the given secondary item (if any). + Returns the match associated with the given secondary node (if any). - :param item: item to match in secondary + :param node: node to match in secondary :return: optional match """ for m in self._matches: - if m.secondary == item: + if m.secondary == node: return m return None - def is_match_primary(self, item: Item) -> bool: + def is_match_primary(self, node: Node) -> bool: + """ + Returns true if the node in primary has been matched with a node in secondary. + + :param node: ndoe to match in primary + :returns: whether the node has been matched + """ + return self.match_primary(node) is not None + + def is_match_secondary(self, node: Node) -> bool: """ - Returns true if the items in primary did match with an item in secondary. + Returns true if the node in secondary has been matched with a node in primary. - :param item: item to match in primary - :return: whether the item is matched in primary + :param node: ndoe to match in secondary + :returns: whether the node has been matched """ - return self.match_primary(item) is not None + return self.match_secondary(node) is not None - def is_match_secondary(self, item: Item) -> bool: + def to_csv(self, path: Path | str, *extra_attrs: * tuple[str, Callable[[Node], Any]]) -> None: """ - Returns true if the item in secondary did match with an item in primary. + Write the mapping into a csv file. + Additional attributes of the nodes to put in the csv can be optionally specified. - :param item: item to match in secondary - :return: whether the item is matched in secondary + For example: + .. code-block:: python + :linenos: + + # Adding the attributes name and type. This will add the fields "primary_name", + # "secondary_name", "primary_type", "secondary_type" + mapping.to_csv("result.csv", ("name", lambda f: f.name), ("type", lambda f: f.type)) + + :param path: The file path of the csv file to write + :param extra_attrs: Additional attributes to put in the csv. Each attribute is a + tuple (attribute_name, attribute_function) """ - return self.match_secondary(item) is not None + + if isinstance(path, str): + path = Path(str) + if not path.exists() or not path.is_file(): + raise ValueError(f"path `{path}` does not exist or is not a file.") + + # Extract the optional extra attributes + attrs_name = [] + attrs_func = [] + for name, func in extra_attrs: + attrs_name.append(f"primary_{name}") + attrs_name.append(f"secondary_{name}") + attrs_func.append(func) + + with open(path, "w") as f: + writer = csv.writer(path, newline="") + writer.writerow( + ("primary_node", "secondary_node", "similarity", "confidence", *attrs_name) + ) + for match in self._matches: + # Get the extra attributes values + extra_values = [] + for func in attrs_func: + extra_values.append(func(match.primary)) + extra_values.append(func(match.secondary)) + + writer.writerow( + ( + match.primary.get_label(), + match.secondary.get_label(), + match.similarity, + match.confidence, + *extra_values, + ) + ) diff --git a/src/qbindiff/types.py b/src/qbindiff/types.py index 5be0b71..0bf4aaa 100644 --- a/src/qbindiff/types.py +++ b/src/qbindiff/types.py @@ -29,7 +29,7 @@ import enum_tools.documentation from enum import IntEnum -from qbindiff.abstract import GenericGraph +from qbindiff.abstract import GenericGraph, GenericNode if TYPE_CHECKING: from qbindiff import Program @@ -60,15 +60,7 @@ An integer representing an address within a program """ -Item: TypeAlias = Any -""" -Item, entity being matched. The only constraint is to be hashable -""" - -Anchors: TypeAlias = list[tuple[Item, Item]] -""" -Pair of lists of user defined index correspondences. Default None. -""" +NodeLabel: TypeAlias = Any # Generic node label RawMapping: TypeAlias = tuple[list[Idx], list[Idx]] """ @@ -80,8 +72,7 @@ Match represent the matching between two functions and can hold the similarity between the two """ - -ExtendedMapping: TypeAlias = Iterable[tuple[Item, Item, float, int]] +ExtendedMapping: TypeAlias = Iterable[tuple[Node, Node, float, int]] """ An extended version of RawMapping with two more lists recording pairing similarity and induced number of squares. """ @@ -121,10 +112,9 @@ Float nxm-Dimensional array. A sparse version of the above SimMatrix """ -Graph: TypeAlias = GenericGraph -""" -A generic Graph, iterable over the nodes -""" +Graph: TypeAlias = GenericGraph # generic Graph, iterable over the nodes + +Node: TypeAlias = GenericNode # Generic node. This is the entity that will be matched SparseVector: TypeAlias = csr_array """ From a2854cec4accd901b3fe123b4287809a26d21218 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 16:34:55 +0100 Subject: [PATCH 02/20] Fix missing definition --- src/qbindiff/types.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/qbindiff/types.py b/src/qbindiff/types.py index 0bf4aaa..7945a52 100644 --- a/src/qbindiff/types.py +++ b/src/qbindiff/types.py @@ -72,11 +72,6 @@ Match represent the matching between two functions and can hold the similarity between the two """ -ExtendedMapping: TypeAlias = Iterable[tuple[Node, Node, float, int]] -""" -An extended version of RawMapping with two more lists recording pairing similarity and induced number of squares. -""" - Dtype: TypeAlias = numpy.dtype """ Numpy data type @@ -116,6 +111,11 @@ Node: TypeAlias = GenericNode # Generic node. This is the entity that will be matched +ExtendedMapping: TypeAlias = Iterable[tuple[Node, Node, float, int]] +""" +An extended version of RawMapping with two more lists recording pairing similarity and induced number of squares. +""" + SparseVector: TypeAlias = csr_array """ Float n-Dimensional sparse array. From 58f6a64bbe436ed6e7ad8ea823fbbc98cc2853eb Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 16:36:02 +0100 Subject: [PATCH 03/20] Fix missing import --- src/qbindiff/loader/function.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/qbindiff/loader/function.py b/src/qbindiff/loader/function.py index 079a388..dabff64 100644 --- a/src/qbindiff/loader/function.py +++ b/src/qbindiff/loader/function.py @@ -16,6 +16,7 @@ """ from __future__ import annotations +from collections.abc import Mapping from typing import TYPE_CHECKING from qbindiff.abstract import GenericNode @@ -24,7 +25,7 @@ if TYPE_CHECKING: import networkx - from collections.abc import Mapping, Generator + from collections.abc import Generator from qbindiff.loader.backend.abstract import AbstractFunctionBackend from qbindiff.types import Addr From 6204b6ef98fc23a0a647e0175bbbfc81e154af4a Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 16:38:20 +0100 Subject: [PATCH 04/20] Correct usage of abc classes --- src/qbindiff/loader/function.py | 2 +- src/qbindiff/loader/program.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qbindiff/loader/function.py b/src/qbindiff/loader/function.py index dabff64..1749c25 100644 --- a/src/qbindiff/loader/function.py +++ b/src/qbindiff/loader/function.py @@ -30,7 +30,7 @@ from qbindiff.types import Addr -class Function(Mapping[Addr, BasicBlock], GenericNode): +class Function(Mapping, GenericNode): """ Representation of a binary function. diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index c5841d8..a3466ad 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -31,7 +31,7 @@ from qbindiff.types import Addr -class Program(MutableMapping[Addr, Function], GenericGraph): +class Program(MutableMapping, GenericGraph): """ Program class that shadows the underlying program backend used. From 1bbc653a214204d8a1666b28dc19444ef1daf5c1 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 16:39:27 +0100 Subject: [PATCH 05/20] Fix missing import --- src/qbindiff/loader/program.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index a3466ad..a8f295c 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -16,6 +16,7 @@ """ from __future__ import annotations +from collections.abc import MutableMapping from typing import TYPE_CHECKING from qbindiff.abstract import GenericGraph From 3a60187644c5ca76203253f8876144e6abd4db45 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 16:40:33 +0100 Subject: [PATCH 06/20] Fix missing parameter --- src/qbindiff/loader/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index a8f295c..e671803 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -133,7 +133,7 @@ def __getitem__(self, key): return self._functions.__getitem__(key) def __setitem__(self, key, value): - self._functions.__setitem__(key) + self._functions.__setitem__(key, value) def __delitem__(self, key): self._functions.__delitem__(key) From cddb8068c7e6dda782291433b92ca140b64acac9 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 16:43:01 +0100 Subject: [PATCH 07/20] Fix recursion issue --- src/qbindiff/loader/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index e671803..fbd3cd2 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -173,7 +173,7 @@ def node_labels(self) -> Iterator[Addr]: :returns: An :py:class:`Iterator` over the functions' address """ - yield from filter(self._filter, self.keys()) + yield from filter(self._filter, self._functions.keys()) @property def nodes(self) -> Iterator[Function]: From 71fd9880736fb61e79cfd884a55bffde40482187 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 16:49:14 +0100 Subject: [PATCH 08/20] Use correct signature for Program.__iter__ --- src/qbindiff/loader/program.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index fbd3cd2..b3c6a33 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -119,12 +119,12 @@ def __repr__(self) -> str: def __iter__(self) -> Iterator[Addr]: """ - Iterate over all functions' address located in the program. + Iterate over all functions located in the program, using the filter registered. - :return: Iterator of all functions' address + :return: Iterator of all the functions """ - yield from self.node_labels + yield from self._functions.values() def __len__(self) -> int: return len(self._functions) From 2d0cd3f4e2981852d906f914c0075e2e301a1e4e Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 17:02:10 +0100 Subject: [PATCH 09/20] Fix Program.items yield bug --- src/qbindiff/loader/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index b3c6a33..e009efc 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -153,7 +153,7 @@ def items(self) -> Iterator[tuple[Addr, Function]]: """ # yield function only if filter agree to keep it - yield from (lambda i: self._filter(i[0]), self._functions.items()) + yield from filter(lambda i: self._filter(i[0]), self._functions.items()) def get_node(self, node_label: Addr) -> Function: """ From 960f467f7caf76bdb9fd9a2b832f7f352efdbe9f Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 12 Feb 2024 17:40:35 +0100 Subject: [PATCH 10/20] Small bugfixes and improved logging --- src/qbindiff/__main__.py | 8 +++++--- src/qbindiff/mapping/mapping.py | 12 +++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/qbindiff/__main__.py b/src/qbindiff/__main__.py index bfd38ae..aa22143 100644 --- a/src/qbindiff/__main__.py +++ b/src/qbindiff/__main__.py @@ -209,9 +209,9 @@ def list_features(ctx: click.Context, param: click.Parameter, value: Any) -> Non "-ff", "--file-format", show_default=True, - default="bindiff", - type=click.Choice(["bindiff"]), - help=f"The file format of the output file. Supported formats are [bindiff]", + default="csv", + type=click.Choice(["bindiff", "csv"]), + help=f"The file format of the output file", ) @click.option( "-v", @@ -385,6 +385,8 @@ def main( logging.info("[+] Saving") if file_format == "bindiff": qbindiff.export_to_bindiff(output) + elif file_format == "csv": + qbindiff.mapping.to_csv(output, ("name", lambda f: f.name)) logging.info("[+] Mapping successfully saved to: %s" % output) diff --git a/src/qbindiff/mapping/mapping.py b/src/qbindiff/mapping/mapping.py index 87241e6..1568375 100644 --- a/src/qbindiff/mapping/mapping.py +++ b/src/qbindiff/mapping/mapping.py @@ -16,7 +16,7 @@ """ from __future__ import annotations -import csv +import csv, logging from typing import TYPE_CHECKING from qbindiff.types import Match @@ -218,8 +218,10 @@ def to_csv(self, path: Path | str, *extra_attrs: * tuple[str, Callable[[Node], A if isinstance(path, str): path = Path(str) - if not path.exists() or not path.is_file(): - raise ValueError(f"path `{path}` does not exist or is not a file.") + if path.exists() and not path.is_file(): + raise ValueError(f"path `{path}` already exists and is not a file.") + if path.exists(): + logging.info(f"Overwriting file {path}") # Extract the optional extra attributes attrs_name = [] @@ -229,8 +231,8 @@ def to_csv(self, path: Path | str, *extra_attrs: * tuple[str, Callable[[Node], A attrs_name.append(f"secondary_{name}") attrs_func.append(func) - with open(path, "w") as f: - writer = csv.writer(path, newline="") + with open(path, "w", newline="") as f: + writer = csv.writer(f) writer.writerow( ("primary_node", "secondary_node", "similarity", "confidence", *attrs_name) ) From 398365fc9a812d4e26ab569230a4c7e41e4ce3d8 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 10:28:07 +0100 Subject: [PATCH 11/20] Add csv exporter in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 54aeed8..1b760cf 100644 --- a/README.md +++ b/README.md @@ -115,8 +115,8 @@ The complete command line options are: -e1, --executable1 PATH Path to the primary raw executable. Must be provided if using quokka loader -e2, --executable2 PATH Path to the secondary raw executable. Must be provided if using quokka loader -o, --output PATH Write output to PATH - -ff, --file-format [bindiff] The file format of the output file. Supported formats are [bindiff] [default: - bindiff] + -ff, --file-format [bindiff|csv] + The file format of the output file [default: csv] -v, --verbose Activate debugging messages. Can be supplied multiple times to increase verbosity --version Show the version and exit. --arch-primary TEXT Force the architecture when disassembling for the primary. Format is From f9b97701f14b3d07f445504e1471975293c4c6aa Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 11:04:43 +0100 Subject: [PATCH 12/20] Use attribute name as default behavior in Mapping.to_csv --- src/qbindiff/mapping/mapping.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/qbindiff/mapping/mapping.py b/src/qbindiff/mapping/mapping.py index 1568375..e47d48c 100644 --- a/src/qbindiff/mapping/mapping.py +++ b/src/qbindiff/mapping/mapping.py @@ -26,6 +26,8 @@ from typing import Callable from qbindiff.types import ExtendedMapping, Node + ExtraAttrsType: TypeAlias = str | tuple[str, Callable[[Node], Any]] + class Mapping: """ @@ -198,7 +200,7 @@ def is_match_secondary(self, node: Node) -> bool: """ return self.match_secondary(node) is not None - def to_csv(self, path: Path | str, *extra_attrs: * tuple[str, Callable[[Node], Any]]) -> None: + def to_csv(self, path: Path | str, *extra_attrs: *ExtraAttrsType) -> None: """ Write the mapping into a csv file. Additional attributes of the nodes to put in the csv can be optionally specified. @@ -207,15 +209,20 @@ def to_csv(self, path: Path | str, *extra_attrs: * tuple[str, Callable[[Node], A .. code-block:: python :linenos: + # Adding the attribute "primary_addr" and "secondary_addr". The value will be obtained + # by accessing `function.addr` + mapping.to_csv("result.csv", "addr") + # Adding the attributes name and type. This will add the fields "primary_name", # "secondary_name", "primary_type", "secondary_type" - mapping.to_csv("result.csv", ("name", lambda f: f.name), ("type", lambda f: f.type)) + mapping.to_csv("result.csv", ("name", lambda f: f.name.upper()), "type") :param path: The file path of the csv file to write - :param extra_attrs: Additional attributes to put in the csv. Each attribute is a - tuple (attribute_name, attribute_function) + :param extra_attrs: Additional attributes to put in the csv. Each attribute is either a + tuple (attribute_name, attribute_function) or a string *attribute_name* """ + # Check the path if isinstance(path, str): path = Path(str) if path.exists() and not path.is_file(): @@ -226,10 +233,16 @@ def to_csv(self, path: Path | str, *extra_attrs: * tuple[str, Callable[[Node], A # Extract the optional extra attributes attrs_name = [] attrs_func = [] - for name, func in extra_attrs: - attrs_name.append(f"primary_{name}") - attrs_name.append(f"secondary_{name}") - attrs_func.append(func) + for extra_attr in extra_attrs: + match extra_attr: + case str(name): + attrs_name.append(f"primary_{name}") + attrs_name.append(f"secondary_{name}") + attrs_func.append(lambda f: getattr(f, name)) + case (name, func): + attrs_name.append(f"primary_{name}") + attrs_name.append(f"secondary_{name}") + attrs_func.append(func) with open(path, "w", newline="") as f: writer = csv.writer(f) From 93b6e9ec2b0c936ac57cb2376a5a9b16880c63bb Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 11:05:15 +0100 Subject: [PATCH 13/20] Reformat comments. NFC --- src/qbindiff/types.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/qbindiff/types.py b/src/qbindiff/types.py index 7945a52..5d780da 100644 --- a/src/qbindiff/types.py +++ b/src/qbindiff/types.py @@ -121,15 +121,9 @@ Float n-Dimensional sparse array. """ -PathLike: TypeAlias = str | Path -""" -Path -""" +PathLike: TypeAlias = str | Path # Path -NodeLabel: TypeAlias = Any -""" -The node label of a generic graph -""" +NodeLabel: TypeAlias = Any # The node label of a generic graph class GenericPrePass(Protocol): From 34073a44514330cd1136430d272509e1a3cae782 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 11:21:11 +0100 Subject: [PATCH 14/20] [doc] Adjust documentation of the csv exporter --- doc/source/export.rst | 47 +++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/doc/source/export.rst b/doc/source/export.rst index ea471fe..9c9cc62 100644 --- a/doc/source/export.rst +++ b/doc/source/export.rst @@ -15,43 +15,32 @@ Given a ``differ`` object initialized, with two binaries to diffs, the diffing a .. code-block:: python matches = differ.compute_matching() - differ.export_to_bindiff('/path/to/output.BinDiff')) + differ.export_to_bindiff('/path/to/output.BinDiff') CSV --- If the diff, does not represent a binary diff, or for further processing the diff -can also be saved in .csv file. +it can also be saved in .csv file. +This is the default file format as it is very lightweight and fast to generate. -TODO: We really have to write the CSV ourselves ? There is not utility functions? +It can either be obtained using the CLI option `-ff csv` or by calling the right API as follows: .. code-block:: python - import csv + from qbindiff.loader.types import FunctionType - matches = differ.compute_matching() + matches: Mapping = differ.compute_matching() + + # This only exports base fields (address, similarity, confidence) + matches.to_csv("/path/to/output.csv") + + # Add extra "name" field + matches.to_csv("/path/to/output.csv", "name") - with open('/path/to/output.csv', 'w') as f: - writer = csv.writer(f) - writer.writerow(( - 'path_primary', - 'func_addr_primary', - 'func_name_primary', - 'path_secondary', - 'func_addr_secondary', - 'func_name_secondary', - 'similarity', - 'confidence' - )) - - for match in matches: - writer.writerow(( - differ.primary.name, - hex(match.primary.addr), - match.primary.name, - differ.secondary.name, - hex(match.secondary.addr), - match.primary.name, - match.similarity, - match.confidence - )) + # Add extra "name" field and custom field + matches.to_csv( + "/path/to/output.csv", + "name", + ("is_library", lambda f: f.type == FunctionType.library) + ) From c65508fba1b4913a62d01e60248359e9443365c1 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 12:02:47 +0100 Subject: [PATCH 15/20] [doc] small UI enhancement --- doc/source/export.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/export.rst b/doc/source/export.rst index 9c9cc62..bad61e9 100644 --- a/doc/source/export.rst +++ b/doc/source/export.rst @@ -24,7 +24,7 @@ If the diff, does not represent a binary diff, or for further processing the dif it can also be saved in .csv file. This is the default file format as it is very lightweight and fast to generate. -It can either be obtained using the CLI option `-ff csv` or by calling the right API as follows: +It can either be obtained using the CLI option ``-ff csv`` or by calling the right API as follows: .. code-block:: python From bcce5b68024d596a5d240dd71c56ffd630b815df Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 12:14:28 +0100 Subject: [PATCH 16/20] [doc] Fix rst. NFC --- src/qbindiff/loader/program.py | 2 +- src/qbindiff/mapping/mapping.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index e009efc..6a12938 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -157,7 +157,7 @@ def items(self) -> Iterator[tuple[Addr, Function]]: def get_node(self, node_label: Addr) -> Function: """ - Get the function identified by the address :paramref:`node_label` + Get the function identified by the address ``node_label`` :param node_label: the address of the function that will be returned :returns: the function identified by its address diff --git a/src/qbindiff/mapping/mapping.py b/src/qbindiff/mapping/mapping.py index e47d48c..614dcde 100644 --- a/src/qbindiff/mapping/mapping.py +++ b/src/qbindiff/mapping/mapping.py @@ -206,6 +206,7 @@ def to_csv(self, path: Path | str, *extra_attrs: *ExtraAttrsType) -> None: Additional attributes of the nodes to put in the csv can be optionally specified. For example: + .. code-block:: python :linenos: From 9bd2e57d73f4b8469afd4d7225855aef4d505689 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 12:19:53 +0100 Subject: [PATCH 17/20] [doc] Fix rst and add missing typing import --- src/qbindiff/mapping/mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qbindiff/mapping/mapping.py b/src/qbindiff/mapping/mapping.py index 614dcde..4686a36 100644 --- a/src/qbindiff/mapping/mapping.py +++ b/src/qbindiff/mapping/mapping.py @@ -23,7 +23,7 @@ if TYPE_CHECKING: from pathlib import Path - from typing import Callable + from typing import Callable, Any from qbindiff.types import ExtendedMapping, Node ExtraAttrsType: TypeAlias = str | tuple[str, Callable[[Node], Any]] @@ -32,7 +32,7 @@ class Mapping: """ This class represents an interface to access the result of the matching analysis. - Its interface is independent of the underlying :py:obj:`Node`s manipulated. + Its interface is independent of the underlying :py:obj:`Node` type manipulated. """ def __init__( From 1008bbaa050cb6e6a4f2f1b38675ce935d079350 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 12:24:22 +0100 Subject: [PATCH 18/20] [doc] Fix rst. NFC --- src/qbindiff/abstract.py | 2 +- src/qbindiff/loader/program.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qbindiff/abstract.py b/src/qbindiff/abstract.py index 34a1f04..a6c77f1 100644 --- a/src/qbindiff/abstract.py +++ b/src/qbindiff/abstract.py @@ -100,6 +100,6 @@ def edges(self) -> Iterable[tuple[NodeLabel, NodeLabel]]: """ Iterate over the edges. An edge is a pair (node_label_a, node_label_b) - :returns: An :py:class`Iterable` over the edges. + :returns: An :py:class:`Iterable` over the edges. """ raise NotImplementedError() diff --git a/src/qbindiff/loader/program.py b/src/qbindiff/loader/program.py index 6a12938..39a0d12 100644 --- a/src/qbindiff/loader/program.py +++ b/src/qbindiff/loader/program.py @@ -190,7 +190,7 @@ def edges(self) -> OutEdgeView[Addr, Addr]: """ Iterate over the edges. An edge is a pair (addr_a, addr_b) - :returns: An :py:class`OutEdgeView` over the edges. + :returns: An :py:class:`OutEdgeView` over the edges. """ return self.callgraph.edges From 27028e37215691eccf557142d4b5ee4149f99364 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 12:34:39 +0100 Subject: [PATCH 19/20] [doc] Use sphinx comments. NFC --- src/qbindiff/types.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/qbindiff/types.py b/src/qbindiff/types.py index 5d780da..ebb29d1 100644 --- a/src/qbindiff/types.py +++ b/src/qbindiff/types.py @@ -60,7 +60,7 @@ An integer representing an address within a program """ -NodeLabel: TypeAlias = Any # Generic node label +NodeLabel: TypeAlias = Any #: The node label of a generic graph RawMapping: TypeAlias = tuple[list[Idx], list[Idx]] """ @@ -107,9 +107,9 @@ Float nxm-Dimensional array. A sparse version of the above SimMatrix """ -Graph: TypeAlias = GenericGraph # generic Graph, iterable over the nodes +Graph: TypeAlias = GenericGraph #: generic Graph, iterable over the nodes -Node: TypeAlias = GenericNode # Generic node. This is the entity that will be matched +Node: TypeAlias = GenericNode #: Generic node. This is the entity that will be matched ExtendedMapping: TypeAlias = Iterable[tuple[Node, Node, float, int]] """ @@ -121,9 +121,7 @@ Float n-Dimensional sparse array. """ -PathLike: TypeAlias = str | Path # Path - -NodeLabel: TypeAlias = Any # The node label of a generic graph +PathLike: TypeAlias = str | Path #: Path class GenericPrePass(Protocol): From 9743497a6fc659370383fd41e8bc2329b7462b10 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 13 Feb 2024 12:34:57 +0100 Subject: [PATCH 20/20] [doc] Add GenericNode API documentation --- doc/source/api/differ.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/source/api/differ.rst b/doc/source/api/differ.rst index 1ef1e14..a655443 100644 --- a/doc/source/api/differ.rst +++ b/doc/source/api/differ.rst @@ -11,6 +11,16 @@ GenericGraph :undoc-members: :exclude-members: +GenericNode +----------- + +.. autoclass:: qbindiff.GenericNode + :members: + :show-inheritance: + :inherited-members: + :undoc-members: + :exclude-members: + Differ ------