diff --git a/.github/pyinstaller/pyinstaller.spec b/.github/pyinstaller/pyinstaller.spec index 021a2b294..e392eb5ae 100644 --- a/.github/pyinstaller/pyinstaller.spec +++ b/.github/pyinstaller/pyinstaller.spec @@ -70,7 +70,10 @@ a = Analysis( "qt5", "pyqtwebengine", "pyasn1", + # don't pull in Binary Ninja/IDA bindings that should + # only be installed locally. "binaryninja", + "ida", ], ) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5477b0ad..bb3420f51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### New Features +- add IDA v9.0 backend via idalib #2376 @williballenthin + ### Breaking Changes ### New Rules (0) diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 806ef8e78..a2b4f7913 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -8,7 +8,6 @@ from typing import List, Tuple, Iterator import idaapi -import ida_nalt import capa.ida.helpers import capa.features.extractors.elf @@ -32,7 +31,9 @@ class IdaFeatureExtractor(StaticFeatureExtractor): def __init__(self): super().__init__( hashes=SampleHashes( - md5=ida_nalt.retrieve_input_file_md5(), sha1="(unknown)", sha256=ida_nalt.retrieve_input_file_sha256() + md5=capa.ida.helpers.retrieve_input_file_md5(), + sha1="(unknown)", + sha256=capa.ida.helpers.retrieve_input_file_sha256(), ) ) self.global_features: List[Tuple[Feature, Address]] = [] diff --git a/capa/features/extractors/ida/idalib.py b/capa/features/extractors/ida/idalib.py new file mode 100644 index 000000000..df1e3172e --- /dev/null +++ b/capa/features/extractors/ida/idalib.py @@ -0,0 +1,113 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import os
import sys
import json
import logging
import importlib.util
from typing import Optional
from pathlib import Path

logger = logging.getLogger(__name__)


def is_idalib_installed() -> bool:
    """Return True when a module named `ida` can be located on the current import path."""
    try:
        return importlib.util.find_spec("ida") is not None
    except ModuleNotFoundError:
        return False


def get_idalib_user_config_path() -> Optional[Path]:
    """
    Get the path to the user's config file based on platform following IDA's user directories.

    Returns:
      the path to `ida-config.json`, or None when it does not exist
      (or, on Windows, when %APPDATA% is not set).
    """
    # derived from `py-activate-idalib.py` from IDA v9.0 Beta 4

    if sys.platform == "win32":
        # On Windows, use the %APPDATA%\Hex-Rays\IDA Pro directory
        appdata = os.getenv("APPDATA")
        if not appdata:
            # without %APPDATA% there is no user directory to inspect;
            # `Path(None)` would raise TypeError.
            return None
        config_dir = Path(appdata) / "Hex-Rays" / "IDA Pro"
    else:
        # On macOS and Linux, use ~/.idapro
        config_dir = Path.home() / ".idapro"

    # Return the full path to the config file (now in JSON format)
    user_config_path = config_dir / "ida-config.json"
    if not user_config_path.exists():
        return None
    return user_config_path


def find_idalib() -> Optional[Path]:
    """
    Locate the idalib Python bindings directory using the user's IDA config file.

    Returns:
      `<ida-install-dir>/idalib/python` when the config file names an install directory
      that contains `ida.hlp`, the platform's idalib shared library, and the `ida` package;
      otherwise None. Unreadable/malformed config files and unsupported platforms
      are treated as "not found" rather than raising.
    """
    config_path = get_idalib_user_config_path()
    if not config_path:
        return None

    try:
        config = json.loads(config_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
        logger.debug("failed to read IDA config file %s: %s", config_path, e)
        return None

    try:
        ida_install_dir = Path(config["Paths"]["ida-install-dir"])
    except (KeyError, TypeError):
        # TypeError: the document (or one of the values) does not have the expected shape
        return None

    if not ida_install_dir.exists():
        return None

    libname = {
        "win32": "idalib.dll",
        "linux": "libidalib.so",
        "linux2": "libidalib.so",
        "darwin": "libidalib.dylib",
    }.get(sys.platform)
    if libname is None:
        logger.debug("unsupported platform for IDA idalib: %s", sys.platform)
        return None

    if not (ida_install_dir / "ida.hlp").is_file():
        return None

    if not (ida_install_dir / libname).is_file():
        return None

    idalib_path = ida_install_dir / "idalib" / "python"
    if not idalib_path.exists():
        return None

    if not (idalib_path / "ida" / "__init__.py").is_file():
        return None

    return idalib_path


def has_idalib() -> bool:
    """Return True when idalib is either importable already or can be found via the IDA config file."""
    if is_idalib_installed():
        logger.debug("found installed IDA idalib API")
        return True

    logger.debug("IDA idalib API not installed, searching...")

    idalib_path = find_idalib()
    if not idalib_path:
        logger.debug("failed to find IDA idalib installation")
        # return here so that we don't also claim to have found it
        return False

    logger.debug("found IDA idalib API: %s", idalib_path)
    return True


def load_idalib() -> bool:
    """
    Import the `ida` module, extending `sys.path` with the discovered idalib directory if needed.

    Returns:
      True when `ida` was imported, False otherwise.
    """
    try:
        import ida  # noqa: F401 unused import

        return True
    except ImportError:
        pass

    idalib_path = find_idalib()
    if not idalib_path:
        return False

    search_path = idalib_path.absolute().as_posix()
    if search_path not in sys.path:
        # avoid stacking duplicate entries when called repeatedly
        sys.path.append(search_path)

    try:
        import ida  # noqa: F401 unused import
    except ImportError:
        return False
    return True
@contextlib.contextmanager
def stdout_redirector(stream):
    """
    Redirect stdout at the C runtime level,
    which lets us handle native libraries that spam stdout.

    *But*, this only works on Linux! Otherwise will silently still write to stdout.
    So, try to upstream the fix when possible.

    Everything written to the stdout file descriptor while the context is active
    is collected in a temporary file and, when the body completes normally,
    written to `stream` as bytes. The original descriptor is restored even when
    the body raises, and the existing `sys.stdout` object is left open and untouched.

    Args:
      stream: a binary file-like object (e.g. `io.BytesIO`) that receives the captured bytes.

    Via: https://eli.thegreenplace.net/2015/redirecting-all-kinds-of-stdout-in-python/
    """
    if sys.platform not in ("linux", "linux2"):
        logger.warning("Unable to capture STDOUT on non-Linux (begin)")
        try:
            yield
        finally:
            logger.warning("Unable to capture STDOUT on non-Linux (end)")
        return

    # libc is only on Linux
    libc = ctypes.CDLL(None)
    c_stdout = ctypes.c_void_p.in_dll(libc, "stdout")

    # The original fd stdout points to. Usually 1 on POSIX systems.
    try:
        original_stdout_fd = sys.stdout.fileno()
    except (AttributeError, OSError, ValueError):
        # sys.stdout is missing or is an in-memory object without a descriptor;
        # fall back to the conventional stdout descriptor.
        original_stdout_fd = 1

    def _redirect_stdout(to_fd):
        """Make the stdout descriptor refer to the same file as `to_fd`."""
        # Flush the C-level buffer stdout
        libc.fflush(c_stdout)
        # Flush (but do not close) the Python-level buffer, so that other
        # holders of the sys.stdout object keep a usable file.
        with contextlib.suppress(AttributeError, OSError, ValueError):
            sys.stdout.flush()
        # Make original_stdout_fd point to the same file as to_fd
        os.dup2(to_fd, original_stdout_fd)

    # Save a copy of the original stdout fd in saved_stdout_fd
    saved_stdout_fd = os.dup(original_stdout_fd)
    try:
        # Create a temporary file and redirect stdout to it
        with tempfile.TemporaryFile(mode="w+b") as tfile:
            _redirect_stdout(tfile.fileno())
            try:
                # Yield to caller
                yield
            finally:
                # always redirect stdout back to the saved fd, even on error
                _redirect_stdout(saved_stdout_fd)
            # Copy contents of temporary file to the given stream
            tfile.seek(0, io.SEEK_SET)
            stream.write(tfile.read())
    finally:
        os.close(saved_stdout_fd)
+ def retrieve_input_file_md5() -> str: + return ida_nalt.retrieve_input_file_md5() + + def retrieve_input_file_sha256() -> str: + return ida_nalt.retrieve_input_file_sha256() + else: def get_filetype() -> "ida_ida.filetype_t": @@ -78,6 +85,12 @@ def is_32bit() -> bool: def is_64bit() -> bool: return idaapi.inf_is_64bit() + def retrieve_input_file_md5() -> str: + return ida_nalt.retrieve_input_file_md5().hex() + + def retrieve_input_file_sha256() -> str: + return ida_nalt.retrieve_input_file_sha256().hex() + def inform_user_ida_ui(message): # this isn't a logger, this is IDA's logging facility diff --git a/capa/loader.py b/capa/loader.py index 818198710..603f72533 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -5,6 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import io import os import sys import logging @@ -69,6 +70,7 @@ BACKEND_VMRAY = "vmray" BACKEND_FREEZE = "freeze" BACKEND_BINEXPORT2 = "binexport2" +BACKEND_IDA = "ida" class CorruptFile(ValueError): @@ -321,6 +323,36 @@ def get_extractor( return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) + elif backend == BACKEND_IDA: + import capa.features.extractors.ida.idalib as idalib + + if not idalib.has_idalib(): + raise RuntimeError( + "cannot find IDA idalib module." + ) + + if not idalib.load_idalib(): + raise RuntimeError("failed to load IDA idalib module.") + + import ida + import ida_auto + + import capa.features.extractors.ida.extractor + + logger.debug("idalib: opening database...") + # idalib writes to stdout (ugh), so we have to capture that + # so as not to screw up structured output. 
+ with capa.helpers.stdout_redirector(io.BytesIO()): + with console.status("analyzing program...", spinner="dots"): + if ida.open_database(str(input_path), run_auto_analysis=True): + raise RuntimeError("failed to analyze input file") + + logger.debug("idalib: waiting for analysis...") + ida_auto.auto_wait() + logger.debug("idalib: opened database.") + + return capa.features.extractors.ida.extractor.IdaFeatureExtractor() + else: raise ValueError("unexpected backend: " + backend) diff --git a/capa/main.py b/capa/main.py index 8035eafa2..d7b45e03a 100644 --- a/capa/main.py +++ b/capa/main.py @@ -43,6 +43,7 @@ from capa.rules import RuleSet from capa.engine import MatchResults from capa.loader import ( + BACKEND_IDA, BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, @@ -283,6 +284,7 @@ def install_common_args(parser, wanted=None): backends = [ (BACKEND_AUTO, "(default) detect appropriate backend automatically"), (BACKEND_VIV, "vivisect"), + (BACKEND_IDA, "IDA via idalib"), (BACKEND_PEFILE, "pefile (file features only)"), (BACKEND_BINJA, "Binary Ninja"), (BACKEND_DOTNET, ".NET"), diff --git a/pyproject.toml b/pyproject.toml index e0954c17d..8a375602e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -185,6 +185,7 @@ known_first_party = [ "ghidra", "ida", "ida_ida", + "ida_auto", "ida_bytes", "ida_entry", "ida_funcs", diff --git a/scripts/detect-backends.py b/scripts/detect-backends.py index 8706ffe6a..5d294992f 100644 --- a/scripts/detect-backends.py +++ b/scripts/detect-backends.py @@ -1,6 +1,13 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and limitations under the License. + import os import sys -import json import logging import importlib.util from typing import Optional @@ -9,6 +16,8 @@ import rich import rich.table +from capa.features.extractors.ida.idalib import find_idalib, load_idalib, is_idalib_installed + logger = logging.getLogger(__name__) @@ -143,103 +152,6 @@ def load_vivisect() -> bool: return False -def is_idalib_installed() -> bool: - try: - return importlib.util.find_spec("ida") is not None - except ModuleNotFoundError: - return False - - -def get_idalib_user_config_path() -> Optional[Path]: - """Get the path to the user's config file based on platform following IDA's user directories.""" - # derived from `py-activate-idalib.py` from IDA v9.0 Beta 4 - - if sys.platform == "win32": - # On Windows, use the %APPDATA%\Hex-Rays\IDA Pro directory - config_dir = Path(os.getenv("APPDATA")) / "Hex-Rays" / "IDA Pro" - else: - # On macOS and Linux, use ~/.idapro - config_dir = Path.home() / ".idapro" - - # Return the full path to the config file (now in JSON format) - user_config_path = config_dir / "ida-config.json" - if not user_config_path.exists(): - return None - return user_config_path - - -def find_idalib() -> Optional[Path]: - config_path = get_idalib_user_config_path() - if not config_path: - return None - - config = json.loads(config_path.read_text(encoding="utf-8")) - - try: - ida_install_dir = Path(config["Paths"]["ida-install-dir"]) - except KeyError: - return None - - if not ida_install_dir.exists(): - return None - - libname = { - "win32": "idalib.dll", - "linux": "libidalib.so", - "linux2": "libidalib.so", - "darwin": "libidalib.dylib", - }[sys.platform] - - if not (ida_install_dir / "ida.hlp").is_file(): - return None - - if not (ida_install_dir / libname).is_file(): - return None - - idalib_path = ida_install_dir / "idalib" / "python" - if not idalib_path.exists(): - return None - - if not (idalib_path / "ida" 
/ "__init__.py").is_file(): - return None - - return idalib_path - - -def has_idalib() -> bool: - if is_idalib_installed(): - logger.debug("found installed IDA idalib API") - return True - - logger.debug("IDA idalib API not installed, searching...") - - idalib_path = find_idalib() - if not idalib_path: - logger.debug("failed to find IDA idalib installation") - - logger.debug("found IDA idalib API: %s", idalib_path) - return idalib_path is not None - - -def load_idalib() -> bool: - try: - import ida - - return True - except ImportError: - idalib_path = find_idalib() - if not idalib_path: - return False - - sys.path.append(idalib_path.absolute().as_posix()) - try: - import ida # noqa: F401 unused import - - return True - except ImportError: - return False - - def main(): logging.basicConfig(level=logging.INFO)