From a557d20e551cac30ceb8003e61ce29fc0110155f Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Wed, 9 Aug 2023 14:39:11 +0200 Subject: [PATCH 1/3] Add black formatter --- bin/binexporter | 39 +++++++++++++++++++++++++-------------- binexport/basic_block.py | 7 ++++++- binexport/expression.py | 32 ++++++++++++++++++-------------- binexport/function.py | 26 ++++++++++++++++---------- binexport/instruction.py | 8 +++++++- binexport/operand.py | 12 +++++++----- binexport/program.py | 28 ++++++++++++++++------------ binexport/types.py | 6 ++++++ pyproject.toml | 7 +++++++ 9 files changed, 108 insertions(+), 57 deletions(-) create mode 100644 pyproject.toml diff --git a/bin/binexporter b/bin/binexporter index be2b9c4..9d1c7a8 100755 --- a/bin/binexporter +++ b/bin/binexporter @@ -10,21 +10,28 @@ import click from binexport import ProgramBinExport -BINARY_FORMAT = {'application/x-dosexec', - 'application/x-sharedlib', - 'application/x-mach-binary', - 'application/x-executable', - 'application/x-pie-executable'} +BINARY_FORMAT = { + "application/x-dosexec", + "application/x-sharedlib", + "application/x-mach-binary", + "application/x-executable", + "application/x-pie-executable", +} -EXTENSIONS_WHITELIST = {'application/octet-stream': ['.dex']} +EXTENSIONS_WHITELIST = {"application/octet-stream": [".dex"]} -CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'], - max_content_width=300) +CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"], max_content_width=300) @click.command(context_settings=CONTEXT_SETTINGS) -@click.option('-i', '--ida-path', type=click.Path(exists=True), default=None, help="IDA Pro installation directory") -@click.option('-v', '--verbose', count=True, help="To activate or not the verbosity") +@click.option( + "-i", + "--ida-path", + type=click.Path(exists=True), + default=None, + help="IDA Pro installation directory", +) +@click.option("-v", "--verbose", count=True, help="To activate or not the verbosity") @click.argument("input_file", 
type=click.Path(exists=True), metavar="") def main(ida_path: str, input_file: str, verbose: bool) -> None: """ @@ -38,14 +45,18 @@ def main(ida_path: str, input_file: str, verbose: bool) -> None: :return: None """ - logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.DEBUG if verbose else logging.INFO) + logging.basicConfig( + format="[%(levelname)s] %(message)s", level=logging.DEBUG if verbose else logging.INFO + ) if ida_path: - os.environ['IDA_PATH'] = pathlib.Path(ida_path).absolute().as_posix() + os.environ["IDA_PATH"] = pathlib.Path(ida_path).absolute().as_posix() mime_type = magic.from_file(input_file, mime=True) input_file = pathlib.Path(input_file) - if mime_type not in BINARY_FORMAT and input_file.suffix not in EXTENSIONS_WHITELIST.get(mime_type, []): + if mime_type not in BINARY_FORMAT and input_file.suffix not in EXTENSIONS_WHITELIST.get( + mime_type, [] + ): logging.error("the file is not an executable file") exit(1) @@ -56,5 +67,5 @@ def main(ida_path: str, input_file: str, verbose: bool) -> None: exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/binexport/basic_block.py b/binexport/basic_block.py index 04ef519..d35e2e3 100644 --- a/binexport/basic_block.py +++ b/binexport/basic_block.py @@ -14,7 +14,12 @@ class BasicBlockBinExport(OrderedDict): methods to access instructions. 
""" - def __init__(self, program: weakref.ref["ProgramBinExport"], function: weakref.ref["FunctionBinExport"], pb_bb: "BinExport2.BasicBlock"): + def __init__( + self, + program: weakref.ref["ProgramBinExport"], + function: weakref.ref["FunctionBinExport"], + pb_bb: "BinExport2.BasicBlock", + ): """ :param program: Weak reference to the program :param function: Weak reference to the function diff --git a/binexport/expression.py b/binexport/expression.py index fe13fe8..2a4b568 100644 --- a/binexport/expression.py +++ b/binexport/expression.py @@ -47,12 +47,14 @@ class ExpressionBinExport: 64: "zmmword", } - def __init__(self, - program: "ProgramBinExport", - function: "FunctionBinExport", - instruction: "InstructionBinExport", - exp_idx: int, - parent: ExpressionBinExport | None = None): + def __init__( + self, + program: "ProgramBinExport", + function: "FunctionBinExport", + instruction: "InstructionBinExport", + exp_idx: int, + parent: ExpressionBinExport | None = None, + ): """ :param program: reference to program :param function: reference to function @@ -101,10 +103,12 @@ def depth(self) -> int: return 0 return self.parent.depth + 1 - def _parse_protobuf(self, - program: "ProgramBinExport", - function: "FunctionBinExport", - instruction: "InstructionBinExport") -> None: + def _parse_protobuf( + self, + program: "ProgramBinExport", + function: "FunctionBinExport", + instruction: "InstructionBinExport", + ) -> None: """ Low-level expression parser. 
It populates self._type and self._value """ @@ -123,9 +127,7 @@ def _parse_protobuf(self, if self.pb_expr.immediate in instruction.data_refs: # Data self.is_addr = True self.is_data = True - elif ( - self.pb_expr.immediate in program or self.pb_expr.immediate in function - ): # Address + elif self.pb_expr.immediate in program or self.pb_expr.immediate in function: # Address self.is_addr = True elif self.pb_expr.type == BinExport2.Expression.IMMEDIATE_FLOAT: @@ -149,4 +151,6 @@ def _parse_protobuf(self, self._value = self.pb_expr.symbol else: - logging.error(f"Malformed protobuf message. Invalid expression type {self.pb_expr.type}") + logging.error( + f"Malformed protobuf message. Invalid expression type {self.pb_expr.type}" + ) diff --git a/binexport/function.py b/binexport/function.py index ce60cc9..0c4d3a4 100644 --- a/binexport/function.py +++ b/binexport/function.py @@ -16,12 +16,14 @@ class FunctionBinExport: Also references its parents and children (function it calls). """ - def __init__(self, - program: weakref.ref["ProgramBinExport"], - *, - pb_fun: "BinExport2.FlowGraph | None" = None, - is_import: bool = False, - addr: Addr | None = None): + def __init__( + self, + program: weakref.ref["ProgramBinExport"], + *, + pb_fun: "BinExport2.FlowGraph | None" = None, + is_import: bool = False, + addr: Addr | None = None, + ): """ Constructor. Iterates the FlowGraph structure and initialize all the basic blocks and instruction accordingly. @@ -34,8 +36,8 @@ def __init__(self, super(FunctionBinExport, self).__init__() self.addr: Addr | None = addr #: address, None if imported function - self.parents: Set['FunctionBinExport'] = set() #: set of function call this one - self.children: Set['FunctionBinExport'] = set() #: set of functions called by this one + self.parents: Set["FunctionBinExport"] = set() #: set of function call this one + self.children: Set["FunctionBinExport"] = set() #: set of functions called by this one # Private attributes self._graph = None # CFG. 
Loaded inside self.blocks @@ -134,10 +136,14 @@ def blocks(self) -> Dict[Addr, BasicBlockBinExport]: # Load the basic blocks bb_i2a = {} # Map {basic block index -> basic block address} for bb_idx in self._pb_fun.basic_block_index: - basic_block = BasicBlockBinExport(self._program, weakref.ref(self), self.program.proto.basic_block[bb_idx]) + basic_block = BasicBlockBinExport( + self._program, weakref.ref(self), self.program.proto.basic_block[bb_idx] + ) if basic_block.addr in bblocks: - logging.error(f"0x{self.addr:x} basic block address (0x{basic_block.addr:x}) already in(idx:{bb_idx})") + logging.error( + f"0x{self.addr:x} basic block address (0x{basic_block.addr:x}) already in(idx:{bb_idx})" + ) bblocks[basic_block.addr] = basic_block bb_i2a[bb_idx] = basic_block.addr diff --git a/binexport/instruction.py b/binexport/instruction.py index 3d5a28a..043dace 100644 --- a/binexport/instruction.py +++ b/binexport/instruction.py @@ -11,7 +11,13 @@ class InstructionBinExport: Instruction class. It represents an instruction with its operands. """ - def __init__(self, program: weakref.ref["ProgramBinExport"], function: weakref.ref["FunctionBinExport"], addr: Addr, i_idx: int): + def __init__( + self, + program: weakref.ref["ProgramBinExport"], + function: weakref.ref["FunctionBinExport"], + addr: Addr, + i_idx: int, + ): """ :param program: Weak reference to the program :param function: Weak reference to the function diff --git a/binexport/operand.py b/binexport/operand.py index b5acd1d..afcfc30 100644 --- a/binexport/operand.py +++ b/binexport/operand.py @@ -12,11 +12,13 @@ class OperandBinExport: Provide access to the underlying expression. 
""" - def __init__(self, - program: weakref.ref["ProgramBinExport"], - function: weakref.ref["FunctionBinExport"], - instruction: weakref.ref["InstructionBinExport"], - op_idx: int): + def __init__( + self, + program: weakref.ref["ProgramBinExport"], + function: weakref.ref["FunctionBinExport"], + instruction: weakref.ref["InstructionBinExport"], + op_idx: int, + ): """ :param program: Weak reference to the program :param function: Weak reference to the function diff --git a/binexport/program.py b/binexport/program.py index d8d636a..80771b4 100644 --- a/binexport/program.py +++ b/binexport/program.py @@ -26,14 +26,13 @@ def __init__(self, file: pathlib.Path | str): self._pb = BinExport2() with open(file, "rb") as f: self._pb.ParseFromString(f.read()) - self.mask = ( - 0xFFFFFFFF if self.architecture.endswith("32") else 0xFFFFFFFFFFFFFFFF - ) - self.fun_names: Dict[str, 'FunctionBinExport'] = {} #: dictionary function name -> name + self.mask = 0xFFFFFFFF if self.architecture.endswith("32") else 0xFFFFFFFFFFFFFFFF + self.fun_names: Dict[str, "FunctionBinExport"] = {} #: dictionary function name -> name self.callgraph: networkx.DiGraph = networkx.DiGraph() #: program callgraph (as Digraph) # Make the data refs map {instruction index -> address referred} - self.data_refs: Dict[int, Set[Addr]] = defaultdict(set) #: dictionary of instruction index to set of refs + # dictionary of instruction index to set of refs + self.data_refs: Dict[int, Set[Addr]] = defaultdict(set) for entry in self.proto.data_reference: self.data_refs[entry.instruction_index].add(entry.address) @@ -75,9 +74,7 @@ def __init__(self, file: pathlib.Path | str): ) count_imp += 1 if node.address not in self: - logging.error( - f"Missing function address: 0x{node.address:x} ({node.type})" - ) + logging.error(f"Missing function address: 0x{node.address:x} ({node.type})") continue self[node.address].type = FunctionType.from_proto(node.type) @@ -98,7 +95,8 @@ def __init__(self, file: pathlib.Path | str): 
self.fun_names[f.name] = f logging.debug( - f"total all:{count_f}, imported:{count_imp} collision:{coll} (total:{count_f + count_imp + coll})") + f"total all:{count_f}, imported:{count_imp} collision:{coll} (total:{count_f + count_imp + coll})" + ) def __repr__(self) -> str: return f"<{type(self).__name__}:{self.name}>" @@ -108,7 +106,7 @@ def from_binary_file( exec_file: pathlib.Path | str, output_file: str | pathlib.Path = "", open_export: bool = True, - override: bool = False + override: bool = False, ) -> "ProgramBinExport | None": """ Generate the .BinExport file for the given program and return an instance @@ -126,7 +124,11 @@ def from_binary_file( from idascript import IDA exec_file = pathlib.Path(exec_file) - binexport_file = pathlib.Path(output_file) if output_file else pathlib.Path(str(exec_file)+".BinExport") + binexport_file = ( + pathlib.Path(output_file) + if output_file + else pathlib.Path(str(exec_file) + ".BinExport") + ) # If the binexport file already exists, do not want to override just return if binexport_file.exists() and not override: @@ -148,7 +150,9 @@ def from_binary_file( if retcode != 0 and not binexport_file.exists(): # Still continue if retcode != 0, because idat64 something crashes but still manage to export file - logging.warning(f"{exec_file.name} failed to export [ret:{retcode}, binexport:{binexport_file.exists()}]") + logging.warning( + f"{exec_file.name} failed to export [ret:{retcode}, binexport:{binexport_file.exists()}]" + ) return None if binexport_file.exists(): diff --git a/binexport/types.py b/binexport/types.py index b00bbcf..0bb91ff 100644 --- a/binexport/types.py +++ b/binexport/types.py @@ -9,17 +9,20 @@ Addr: TypeAlias = int """An integer representing an address within a program""" + @enum_tools.documentation.document_enum class FunctionType(enum.Enum): """ Function types as defined by IDA """ + # fmt: off NORMAL = enum.auto() # doc: Normal function LIBRARY = enum.auto() # doc: library function IMPORTED = enum.auto() 
# doc: imported function (don't have content) THUNK = enum.auto() # doc: thunk function (trampoline to another function) INVALID = enum.auto() # doc: invalid function (as computed by IDA) + # fmt: on @staticmethod def from_proto(function_type: BinExport2.CallGraph.Vertex.Type) -> FunctionType: @@ -33,12 +36,14 @@ def from_proto(function_type: BinExport2.CallGraph.Vertex.Type) -> FunctionType: return mapping.get(function_type, FunctionType.INVALID) + @enum_tools.documentation.document_enum class ExpressionType(enum.Enum): """ Expression type derived from protobuf expression types. """ + # fmt: off FUNC_NAME = enum.auto() # doc: function name VAR_NAME = enum.auto() # doc: variable name IMMEDIATE_INT = enum.auto() # doc: immediate value @@ -46,3 +51,4 @@ class ExpressionType(enum.Enum): SYMBOL = enum.auto() # doc: symbol expression REGISTER = enum.auto() # doc: register expression SIZE = enum.auto() # doc: size expression (byte, dword ..) + # fmt: on diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e31cd35 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.black] +line-length = 100 +target-version = ['py310'] From 8a6f78011909053d3f06f001d1acfb0fc1548060 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Wed, 9 Aug 2023 15:04:01 +0200 Subject: [PATCH 2/3] Separate the API between cached/uncached methods --- binexport/basic_block.py | 69 ++++++++++++++++++++++++++++++++-------- binexport/function.py | 21 ++++++++++-- binexport/instruction.py | 16 ++++++++-- binexport/operand.py | 18 +++++++++-- 4 files changed, 102 insertions(+), 22 deletions(-) diff --git a/binexport/basic_block.py b/binexport/basic_block.py index d35e2e3..b298845 100644 --- a/binexport/basic_block.py +++ b/binexport/basic_block.py @@ -1,17 +1,13 @@ import weakref -from collections import OrderedDict -from typing import Optional from binexport.utils import 
instruction_index_range, get_instruction_address from binexport.instruction import InstructionBinExport from binexport.types import Addr -class BasicBlockBinExport(OrderedDict): +class BasicBlockBinExport: """ - Basic block. - It inherits OrderdDict, so one can use any dictionary - methods to access instructions. + Basic block class. """ def __init__( @@ -29,8 +25,10 @@ def __init__( super(BasicBlockBinExport, self).__init__() self._program = program - self.addr: Addr = None #: basic bloc address + self._function = function + self.pb_bb = pb_bb + self.addr: Addr = None #: basic bloc address self.bytes = b"" #: bytes of the basic block # Ranges are in fact the true basic blocks but BinExport @@ -39,15 +37,11 @@ def __init__( # might be merged into a single basic block so the edge gets lost. for rng in pb_bb.instruction_index: for idx in instruction_index_range(rng): - pb_inst = self.program.proto.instruction[idx] - inst_addr = get_instruction_address(self.program.proto, idx) + self.bytes += self.program.proto.instruction[idx].raw_bytes # The first instruction determines the basic block address if self.addr is None: - self.addr = inst_addr - - self.bytes += pb_inst.raw_bytes - self[inst_addr] = InstructionBinExport(self._program, function, inst_addr, idx) + self.addr = get_instruction_address(self.program.proto, idx) def __hash__(self) -> int: """ @@ -71,3 +65,52 @@ def program(self) -> "ProgramBinExport": :return: object :py:class:`ProgramBinExport`, program associated to the basic block """ return self._program() + + @property + def function(self) -> "FunctionBinExport": + """ + Wrapper on weak reference on FunctionBinExport + + :return: object :py:class:`FunctionBinExport`, function associated to the basic block + """ + return self._function() + + @property + def uncached_instructions(self) -> dict[Addr, InstructionBinExport]: + """ + Returns a dict which is used to reference all the instructions in this basic + block by their address. 
+ The object returned is not cached, calling this function multiple times will + create the same object multiple times. If you want to cache the object you + should use `BasicBlockBinExport.instructions`. + + :return: dictionary of addresses to instructions + """ + + instructions = {} + + # Ranges are in fact the true basic blocks but BinExport + # doesn't have the same basic block semantic and merge multiple basic blocks into one. + # For example: BB_1 -- unconditional_jmp --> BB_2 + # might be merged into a single basic block so the edge gets lost. + for rng in self.pb_bb.instruction_index: + for idx in instruction_index_range(rng): + inst_addr = get_instruction_address(self.program.proto, idx) + + instructions[inst_addr] = InstructionBinExport( + self._program, self._function, inst_addr, idx + ) + + return instructions + + @cached_property + def instructions(self) -> dict[Addr, InstructionBinExport]: + """ + Returns a dict which is used to reference all the instructions in this basic + block by their address. + The object returned is by default cached, to erase the cache delete the attribute. + + :return: dictionary of addresses to instructions + """ + + return self.uncached_instructions diff --git a/binexport/function.py b/binexport/function.py index 0c4d3a4..17cce9b 100644 --- a/binexport/function.py +++ b/binexport/function.py @@ -112,15 +112,18 @@ def program(self) -> "ProgramBinExport": """ return self._program() - @cached_property - def blocks(self) -> Dict[Addr, BasicBlockBinExport]: + @property + def uncached_blocks(self) -> dict[Addr, BasicBlockBinExport]: """ Returns a dict which is used to reference all basic blocks by their address. - The dict is by default cached, to erase the cache delete the attribute. Calling this function will also load the CFG. + The object returned is not cached, calling this function multiple times will + create the same object multiple times. If you want to cache the object you + should use `FunctionBinExport.blocks`. 
:return: dictionary of addresses to basic blocks """ + # Fast return if it is a imported function if self.is_import(): if self._graph is None: @@ -166,6 +169,18 @@ def blocks(self) -> Dict[Addr, BasicBlockBinExport]: return bblocks + @cached_property + def blocks(self) -> Dict[Addr, BasicBlockBinExport]: + """ + Returns a dict which is used to reference all basic blocks by their address. + Calling this function will also load the CFG. + The dict is by default cached, to erase the cache delete the attribute. + + :return: dictionary of addresses to basic blocks + """ + + return self.uncached_blocks + @property def graph(self) -> networkx.DiGraph: """ diff --git a/binexport/instruction.py b/binexport/instruction.py index 043dace..d45ebc2 100644 --- a/binexport/instruction.py +++ b/binexport/instruction.py @@ -61,13 +61,23 @@ def mnemonic(self) -> str: """ return self.program.proto.mnemonic[self.pb_instr.mnemonic_index].name - @cached_property - def operands(self) -> List[OperandBinExport]: + @property + def uncached_operands(self) -> list[OperandBinExport]: """ Returns a list of the operands instanciated dynamically on-demand. - The list is cached by default, to erase the cache delete the attribute. + The object returned is not cached, calling this function multiple times will + create the same object multiple times. If you want to cache the object you + should use `InstructionBinExport.operands`. """ return [ OperandBinExport(self._program, self._function, weakref.ref(self), op_idx) for op_idx in self.pb_instr.operand_index ] + + @cached_property + def operands(self) -> List[OperandBinExport]: + """ + Returns a list of the operands instanciated dynamically on-demand. + The list is cached by default, to erase the cache delete the attribute. 
+ """ + return self.uncached_operands diff --git a/binexport/operand.py b/binexport/operand.py index afcfc30..c09c917 100644 --- a/binexport/operand.py +++ b/binexport/operand.py @@ -108,12 +108,14 @@ def pb_operand(self) -> "BinExport2.Operand": """ return self.program.proto.operand[self._idx] - @cached_property - def expressions(self) -> List[ExpressionBinExport]: + @property + def uncached_expressions(self) -> List[ExpressionBinExport]: """ Iterates over all the operand expression in a pre-order manner (binary operator first). - The list is cached by default, to erase the cache delete the attribute + The object returned is not cached, calling this function multiple times will + create the same object multiple times. If you want to cache the object you + should use `OperandBinExport.expressions`. """ expr_dict = {} # {expression protobuf idx : ExpressionBinExport} @@ -125,3 +127,13 @@ def expressions(self) -> List[ExpressionBinExport]: self.program, self.function, self.instruction, exp_idx, parent ) return list(expr_dict.values()) + + @cached_property + def expressions(self) -> List[ExpressionBinExport]: + """ + Iterates over all the operand expression in a pre-order manner + (binary operator first). + The list is cached by default, to erase the cache delete the attribute + """ + + return self.uncached_expressions From 51b1d00f70034d26389371e9958fd5237cc19ad7 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Wed, 9 Aug 2023 15:49:18 +0200 Subject: [PATCH 3/3] Add missing import --- binexport/basic_block.py | 1 + 1 file changed, 1 insertion(+) diff --git a/binexport/basic_block.py b/binexport/basic_block.py index b298845..94907a7 100644 --- a/binexport/basic_block.py +++ b/binexport/basic_block.py @@ -1,4 +1,5 @@ import weakref +from functools import cached_property from binexport.utils import instruction_index_range, get_instruction_address from binexport.instruction import InstructionBinExport