From 3141e940de1ce2ca7564e10bfce6a1cda40d66f6 Mon Sep 17 00:00:00 2001
From: r-sm2024 <moonsunghyun96@gmail.com>
Date: Mon, 10 Jun 2024 21:13:16 +0000
Subject: [PATCH 001/105] Add vmray text to JSON parser.

---
 vmray_parser.py | 138 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 vmray_parser.py

diff --git a/vmray_parser.py b/vmray_parser.py
new file mode 100644
index 000000000..cd52e0d8e
--- /dev/null
+++ b/vmray_parser.py
@@ -0,0 +1,138 @@
+import re
+import json
+import argparse
+
+from datetime import datetime
+
+class VMrayParser:
+
+    def read_vmray_log(self):
+        """Return all lines of self.filename as a list, line endings included."""
+        with open(self.filename, 'r') as log_file:
+            return log_file.readlines()
+
+    def __init__(self, filename, output_filename):
+        # Paths of the VMray text log to read and of the JSON file to write.
+        self.filename = filename
+        self.output_filename = output_filename
+        # Accumulators filled by the parse_* methods and dumped by write_json_file().
+        self.data, self.processes, self.threads = {}, [], []
+        self.current_process = None
+
+    def parse_info(self, lines):
+        """Store analyzer version, build date and log creation date in self.data["info"]."""
+        header = {}
+        for entry in lines:
+            if entry.startswith("# Analyzer Version:"):
+                header["analyzer_version"] = int(entry.split(":")[1].strip().replace(".", ""))
+            elif entry.startswith("# Analyzer Build Date:"):
+                header["analyzer_build_date"] = datetime.strptime(entry.partition(":")[2].strip(), "%b %d %Y %H:%M:%S").isoformat()
+            elif entry.startswith("# Log Creation Date:"):
+                header["log_create_date"] = datetime.strptime(entry.partition(":")[2].strip(), "%d.%m.%Y %H:%M:%S.%f").isoformat()
+        self.data["info"] = header
+
+    def parse_process(self, lines):
+        """Parse one "Process:" section and append the result to self.processes.
+
+        Each `key = "value"` pair found in *lines* becomes one entry of a dict;
+        keys and values are stripped and values stay strings. Only quoted
+        values match the pattern, so unquoted assignments are skipped.
+
+        Args:
+            lines: raw text lines that belong to a single process section.
+        """
+        # Compile once per call rather than looking the pattern up per line.
+        pair_pattern = re.compile(r"\s+(.+?) = \"(.*?)\"")
+        process_data = {}
+
+        for line in lines:
+            # The pattern has exactly two groups, so findall yields
+            # (key, value) tuples. The earlier code fell back to match[2]
+            # whenever the value was empty, which raised IndexError on
+            # input such as `cmd_line = ""`; empty values are now kept
+            # as "" instead of crashing.
+            for key, value in pair_pattern.findall(line):
+                process_data[key.strip()] = value.strip()
+
+        # One dict is appended per section, even when no pair matched,
+        # exactly as the previous implementation did.
+        self.processes.append(process_data)
+    
+
+    def parse_thread(self, lines):
+        """Parse one "Thread:" section, append it to self.threads and return it.
+
+        The resulting dict holds (the first two only if such a line exists):
+          * "id"     - int parsed from the tab-indented `id =` line
+          * "os_tid" - stripped text after `=` on the `os_tid =` line
+          * "calls"  - stripped tab-indented lines starting with "[" that
+                       follow the `id =` line
+
+        Args:
+            lines: raw text lines that belong to a single thread section.
+        Returns:
+            The dict that was appended to self.threads.
+        """
+        record = {}
+        api_calls = []
+        seen_id = None
+
+        for raw in lines:
+            if raw.startswith("\tid ="):
+                seen_id = int(raw.split("=")[1].strip().strip('"'))
+                record["id"] = seen_id
+            elif raw.startswith("\tos_tid ="):
+                record["os_tid"] = raw.split("=")[1].strip()
+            elif seen_id is not None and raw.startswith("\t["):
+                # Bracket-prefixed lines are kept verbatim (stripped) as call entries.
+                api_calls.append(raw.strip())
+        record["calls"] = api_calls
+        self.threads.append(record)
+        return record
+        
+    def write_json_file(self):
+        """Attach the parsed processes/threads to self.data and dump it as JSON."""
+        # Same key order as before: "process" first, then "threads".
+        self.data.update(process=self.processes, threads=self.threads)
+        with open(self.output_filename, 'w') as out_file:
+            json.dump(self.data, out_file, indent=4)
+
+    def convert(self):
+        """Read the log, parse header and sections, write and print the JSON.
+
+        Sections start at lines beginning with "Process:" or "Thread:"; each
+        one is passed to the parser that matches its own header line.
+        """
+        lines = self.read_vmray_log()
+        self.parse_info(lines)
+        self.current_process = None  # Set current_process to None at the start of convert
+        parsers = {"Process:": self.parse_process, "Thread:": self.parse_thread}
+        section_parser = None  # parser of the section currently being collected
+        section_lines = []
+        for line in lines:
+            header = next((h for h in parsers if line.startswith(h)), None)
+            if header is None:
+                section_lines.append(line)
+                continue
+            # Flush the finished section with ITS OWN parser. The old code parsed
+            # whatever was pending with the parser of the NEXT header, so process
+            # sections went to parse_thread, thread sections to parse_process, and
+            # the lines before the first header produced an extra process entry.
+            if section_parser is not None:
+                section_parser(section_lines)
+            section_parser, section_lines = parsers[header], [line]
+        if section_parser is not None:
+            section_parser(section_lines)
+        self.write_json_file()
+        print(json.dumps(self.data, indent=4))
+
+if __name__ == "__main__":
+    # Command line: <input_file> [-o OUTPUT]; OUTPUT defaults to vmray_output.json.
+    cli = argparse.ArgumentParser(description="Convert VMray log files to JSON.")
+    cli.add_argument("input_file", help="The path to the VMray log file")
+    cli.add_argument("-o", "--output_file", default="vmray_output.json", help="The path to the output JSON file")
+    options = cli.parse_args()
+
+    # Run the conversion, then report where the JSON was written.
+    VMrayParser(options.input_file, options.output_file).convert()
+    print(f"Your VMray flog file '{options.input_file}' was converted to JSON and saved to '{options.output_file}'.")
\ No newline at end of file

From a9dafe283c8ad7982d7f4ff34842b772c3b0d25e Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Thu, 13 Jun 2024 16:37:45 +0000
Subject: [PATCH 002/105] example using pydantic-xml to parse flog.xml

---
 capa/features/extractors/vmray/__init__.py  |  0
 capa/features/extractors/vmray/extractor.py | 28 ++++++++++++++
 capa/features/extractors/vmray/models.py    | 43 +++++++++++++++++++++
 3 files changed, 71 insertions(+)
 create mode 100644 capa/features/extractors/vmray/__init__.py
 create mode 100644 capa/features/extractors/vmray/extractor.py
 create mode 100644 capa/features/extractors/vmray/models.py

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
new file mode 100644
index 000000000..7a3565a3e
--- /dev/null
+++ b/capa/features/extractors/vmray/extractor.py
@@ -0,0 +1,28 @@
+from typing import Dict
+from pathlib import Path
+
+import pydantic_xml
+
+from capa.features.extractors.vmray.models import Analysis
+from capa.features.extractors.base_extractor import SampleHashes, DynamicFeatureExtractor
+
+# TODO also/or look into xmltodict?
+
+
+class VMRayExtractor(DynamicFeatureExtractor):
+    def __init__(self, report: Path): ...
+
+    @classmethod
+    def from_report(cls, report: Path) -> "VMRayExtractor":
+        print(report.read_text()[:200])
+
+        vr = Analysis.from_xml(report.read_text())
+
+        print(vr)
+
+
+if __name__ == "__main__":
+    import sys
+
+    input_path = Path(sys.argv[1])
+    VMRayExtractor.from_report(input_path)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
new file mode 100644
index 000000000..e8036bed4
--- /dev/null
+++ b/capa/features/extractors/vmray/models.py
@@ -0,0 +1,43 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+from typing import Any, Dict, List, Union, Literal, Optional
+
+# TODO install/force lxml?
+from pydantic_xml import BaseXmlModel, attr, element
+
+
+class FunctionCall(BaseXmlModel, tag="fncall"):
+    # ts: str = attr()
+    # fncall_id: int = attr()
+    # process_id: int = attr()
+    name: str = attr()
+    # in_: element(name="in")
+    # out: element()
+
+
+class MonitorProcess(BaseXmlModel, tag="monitor_process"):
+    ts: str = attr()
+    process_id: int = attr()
+    image_name: str = attr()
+
+
+class MonitorThread(BaseXmlModel, tag="monitor_thread"):
+    ts: str = attr()
+    thread_id: int = attr()
+    process_id: int = attr()
+    os_tid: str = attr()  # TODO hex
+
+
+class Analysis(BaseXmlModel, tag="analysis"):
+    log_version: str = attr()
+    analyzer_version: str = attr()
+    analysis_date: str = attr()
+    processes: List[MonitorProcess] = element(tag="monitor_process")
+    threads: List[MonitorThread] = element(tag="monitor_thread")
+    # failing so far...
+    # fncall: List[FunctionCall] = element(tag="fncall")

From a797405648434849fd6d44f984ba979f1bea324f Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 13 Jun 2024 12:54:59 -0600
Subject: [PATCH 003/105] vmray: add example models for summary_v2.json

---
 capa/features/extractors/vmray/extractor.py | 37 ++++++++-
 capa/features/extractors/vmray/models.py    | 92 +++++++++++++++++++++
 2 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 7a3565a3e..916f6e286 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -3,7 +3,8 @@
 
 import pydantic_xml
 
-from capa.features.extractors.vmray.models import Analysis
+import capa.helpers
+from capa.features.extractors.vmray.models import Analysis, SummaryV2
 from capa.features.extractors.base_extractor import SampleHashes, DynamicFeatureExtractor
 
 # TODO also/or look into xmltodict?
@@ -20,9 +21,43 @@ def from_report(cls, report: Path) -> "VMRayExtractor":
 
         print(vr)
 
+    @classmethod
+    def from_summary(cls, sv2_path: Path):
+        sv2_json = capa.helpers.load_json_from_path(sv2_path)
+        sv2 = SummaryV2.model_validate(sv2_json)
+
+        for k, v in sv2.files.items():
+            if not v.is_sample:
+                continue
+
+            if not v.ref_static_data:
+                continue
+
+            static_data = sv2.static_data.get(v.ref_static_data.path[1])
+
+            print(f"file_type: {static_data.pe.basic_info.file_type}")
+            print(f"image_base: {hex(static_data.pe.basic_info.image_base)}")
+            print(f"machine_type: {static_data.pe.basic_info.machine_type}")
+
+            if not static_data.pe:
+                continue
+
+            pe = static_data.pe
+
+            if pe.exports:
+                print("exports")
+                for export in pe.exports:
+                    print(f"\tname: {export.api.name}, address: {hex(export.address)}")
+
+            if pe.imports:
+                print("imports")
+                for import_ in pe.imports:
+                    print(f"\tdll: {import_.dll} ({len(import_.apis)})")
 
 if __name__ == "__main__":
     import sys
 
     input_path = Path(sys.argv[1])
+
     VMRayExtractor.from_report(input_path)
+    # VMRayExtractor.from_summary(input_path)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index e8036bed4..d3b4a74c3 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -7,10 +7,15 @@
 # See the License for the specific language governing permissions and limitations under the License.
 from typing import Any, Dict, List, Union, Literal, Optional
 
+from pydantic import BaseModel
+
 # TODO install/force lxml?
 from pydantic_xml import BaseXmlModel, attr, element
 
 
+### models for flog.xml
+
+
 class FunctionCall(BaseXmlModel, tag="fncall"):
     # ts: str = attr()
     # fncall_id: int = attr()
@@ -41,3 +46,90 @@ class Analysis(BaseXmlModel, tag="analysis"):
     threads: List[MonitorThread] = element(tag="monitor_thread")
     # failing so far...
     # fncall: List[FunctionCall] = element(tag="fncall")
+
+
+### models for summary_v2.json files
+
+
+class GenericReference(BaseModel):
+    _type: str
+    path: List[str]
+    source: str
+
+
+class StaticDataReference(GenericReference): ...
+
+
+class PEFileBasicInfo(BaseModel):
+    _type: str
+    compile_time: str
+    file_type: str
+    image_base: int
+    machine_type: str
+    size_of_code: int
+    size_of_initialized_data: int
+    size_of_uninitialized_data: int
+    subsystem: str
+    entry_point: int
+    imphash: Optional[str] = None
+
+
+class API(BaseModel):
+    _type: str
+    name: str
+    ordinal: Optional[int] = None
+
+
+class PEFileExport(BaseModel):
+    _type: str
+    address: int
+    api: API
+
+
+class PEFileImport(BaseModel):
+    _type: str
+    address: int
+    api: API
+    thunk_offset: int
+    hint: Optional[int] = None
+    thunk_rva: int
+
+
+class PEFileImportModule(BaseModel):
+    _type: str
+    dll: str
+    apis: List[PEFileImport]
+
+
+class PEFile(BaseModel):
+    _type: str
+    basic_info: Optional[PEFileBasicInfo] = None
+    exports: Optional[List[PEFileExport]] = None
+    imports: Optional[List[PEFileImportModule]] = None
+
+
+class StaticData(BaseModel):
+    pe: Optional[PEFile] = None
+
+
+class File(BaseModel):
+    _type: str
+    categories: List[str]
+    hash_values: Dict[str, str]
+    is_artifact: bool
+    is_ioc: bool
+    is_sample: bool
+    size: int
+    is_truncated: bool
+    mime_type: Optional[str] = None
+    operations: Optional[List[str]] = None
+    ref_filenames: Optional[List[GenericReference]] = None
+    ref_gfncalls: Optional[List[GenericReference]] = None
+    ref_static_data: Optional[StaticDataReference] = None
+    ref_vti_matches: Optional[List[GenericReference]] = None
+    verdict: str
+
+
+class SummaryV2(BaseModel):
+    files: Dict[str, File]
+    static_data: Dict[str, StaticData]

From ca02b4ac7c491f9fe8509efc8f4d5913e13a6691 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 13 Jun 2024 14:12:41 -0600
Subject: [PATCH 004/105] vmray: expand extractor to emit file export features

---
 capa/features/extractors/vmray/__init__.py  | 47 ++++++++++++++
 capa/features/extractors/vmray/extractor.py | 69 +++++++++------------
 capa/features/extractors/vmray/file.py      | 36 +++++++++++
 capa/features/extractors/vmray/models.py    |  3 +-
 4 files changed, 113 insertions(+), 42 deletions(-)
 create mode 100644 capa/features/extractors/vmray/file.py

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index e69de29bb..4dc4f59ad 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -0,0 +1,47 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+from typing import Dict
+
+
+class VMRayAnalysis:
+    def __init__(self, sv2, flog):
+        self.sv2 = sv2  # logs/summary_v2.json
+        self.flog = flog  # logs/flog.xml
+        self.exports: Dict[int, str] = {}
+        self.imports: Dict[int, str] = {}
+
+        self.sample_file_name: str
+        self.sample_file_analysis = None
+        self.sample_file_static_data = None
+
+        self._find_sample_file()
+        self._compute_exports()
+
+    def _find_sample_file(self):
+        for k, v in self.sv2.files.items():
+            if v.is_sample:
+                self.sample_file_name = k
+                self.sample_file_analysis = v
+
+                if v.ref_static_data:
+                    self.sample_file_static_data = self.sv2.static_data.get(v.ref_static_data.path[1])
+
+                break
+
+    def _compute_exports(self):
+        if not self.sample_file_static_data:
+            return
+
+        if not self.sample_file_static_data.pe:
+            return
+
+        pe = self.sample_file_static_data.pe
+
+        if pe.exports:
+            for export in pe.exports:
+                self.exports[export.address] = export.api.name
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 916f6e286..08c9abdee 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -1,63 +1,52 @@
-from typing import Dict
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import json
+from typing import Tuple, Iterator
 from pathlib import Path
-
-import pydantic_xml
+from zipfile import ZipFile
 
 import capa.helpers
+import capa.features.extractors.vmray.file
+from capa.features.common import Feature
+from capa.features.address import Address
+from capa.features.extractors.vmray import VMRayAnalysis
 from capa.features.extractors.vmray.models import Analysis, SummaryV2
-from capa.features.extractors.base_extractor import SampleHashes, DynamicFeatureExtractor
+from capa.features.extractors.base_extractor import DynamicFeatureExtractor
 
 # TODO also/or look into xmltodict?
 
 
 class VMRayExtractor(DynamicFeatureExtractor):
-    def __init__(self, report: Path): ...
+    def __init__(self, analysis):
+        self.analysis = analysis
 
     @classmethod
-    def from_report(cls, report: Path) -> "VMRayExtractor":
-        print(report.read_text()[:200])
-
-        vr = Analysis.from_xml(report.read_text())
+    def from_archive(cls, archive_path: Path):
+        archive = ZipFile(archive_path, "r")
 
-        print(vr)
-
-    @classmethod
-    def from_summary(cls, sv2_path: Path):
-        sv2_json = capa.helpers.load_json_from_path(sv2_path)
+        sv2_json = json.loads(archive.read("logs/summary_v2.json", pwd=b"infected"))
         sv2 = SummaryV2.model_validate(sv2_json)
 
-        for k, v in sv2.files.items():
-            if not v.is_sample:
-                continue
-
-            if not v.ref_static_data:
-                continue
-
-            static_data = sv2.static_data.get(v.ref_static_data.path[1])
-
-            print(f"file_type: {static_data.pe.basic_info.file_type}")
-            print(f"image_base: {hex(static_data.pe.basic_info.image_base)}")
-            print(f"machine_type: {static_data.pe.basic_info.machine_type}")
-
-            if not static_data.pe:
-                continue
+        flog_xml = archive.read("logs/flog.xml", pwd=b"infected")
+        flog = Analysis.from_xml(flog_xml)
 
-            pe = static_data.pe
+        return cls(VMRayAnalysis(sv2, flog))
 
-            if pe.exports:
-                print("exports")
-                for export in pe.exports:
-                    print(f"\tname: {export.api.name}, address: {hex(export.address)}")
+    def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
+        yield from capa.features.extractors.vmray.file.extract_features(self.analysis)
 
-            if pe.imports:
-                print("imports")
-                for import_ in pe.imports:
-                    print(f"\tdll: {import_.dll} ({len(import_.apis)})")
 
 if __name__ == "__main__":
     import sys
 
     input_path = Path(sys.argv[1])
 
-    VMRayExtractor.from_report(input_path)
-    # VMRayExtractor.from_summary(input_path)
+    extractor = VMRayExtractor.from_archive(input_path)
+    for feat, addr in extractor.extract_file_features():
+        print(f"{feat} -> {addr}")
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
new file mode 100644
index 000000000..48af3d393
--- /dev/null
+++ b/capa/features/extractors/vmray/file.py
@@ -0,0 +1,36 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import logging
+from typing import Tuple, Iterator
+
+from capa.features.file import Export
+from capa.features.common import Feature
+from capa.features.address import Address, AbsoluteVirtualAddress
+from capa.features.extractors.vmray import VMRayAnalysis
+
+logger = logging.getLogger(__name__)
+
+
+def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for addr, name in analysis.exports.items():
+        yield Export(name), AbsoluteVirtualAddress(addr)
+
+
+def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for handler in FILE_HANDLERS:
+        for feature, addr in handler(analysis):
+            yield feature, addr
+
+
+FILE_HANDLERS = (
+    # extract_import_names,
+    extract_export_names,
+    # extract_section_names,
+    # extract_file_strings,
+)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index d3b4a74c3..3c627f334 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -5,14 +5,13 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-from typing import Any, Dict, List, Union, Literal, Optional
+from typing import Dict, List, Optional
 
 from pydantic import BaseModel
 
 # TODO install/force lxml?
 from pydantic_xml import BaseXmlModel, attr, element
 
-
 ### models for flog.xml
 
 

From 970b184651a9ac8a38e6e24beef1d479b12c82d5 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 13 Jun 2024 14:20:11 -0600
Subject: [PATCH 005/105] vmray: add stubs for file imports

---
 capa/features/extractors/vmray/__init__.py | 4 ++++
 capa/features/extractors/vmray/file.py     | 8 ++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 4dc4f59ad..4472e86a3 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -45,3 +45,7 @@ def _compute_exports(self):
         if pe.exports:
             for export in pe.exports:
                 self.exports[export.address] = export.api.name
+
+    def _compute_imports(self):
+        # TODO (meh)
+        ...
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 48af3d393..81c150f27 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -5,7 +5,6 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-
 import logging
 from typing import Tuple, Iterator
 
@@ -22,6 +21,11 @@ def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Add
         yield Export(name), AbsoluteVirtualAddress(addr)
 
 
+def extract_import_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    # TODO (meh)
+    yield from []
+
+
 def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
     for handler in FILE_HANDLERS:
         for feature, addr in handler(analysis):
@@ -29,7 +33,7 @@ def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address
 
 
 FILE_HANDLERS = (
-    # extract_import_names,
+    extract_import_names,
     extract_export_names,
     # extract_section_names,
     # extract_file_strings,

From 7d0ac71353b17d72d8a22f68daf8973ae35190d3 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 13 Jun 2024 16:31:12 -0600
Subject: [PATCH 006/105] vmray: cleanup pydantic models and implement file
 section extraction

---
 capa/features/extractors/vmray/__init__.py | 42 +++++++++++-----------
 capa/features/extractors/vmray/file.py     |  9 +++--
 capa/features/extractors/vmray/models.py   | 24 +++++++++----
 3 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 4472e86a3..662153603 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -5,47 +5,49 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-from typing import Dict
+from typing import Dict, Optional
+
+from capa.features.extractors.vmray.models import File, Analysis, SummaryV2, StaticData
 
 
 class VMRayAnalysis:
-    def __init__(self, sv2, flog):
+    def __init__(self, sv2: SummaryV2, flog: Analysis):
         self.sv2 = sv2  # logs/summary_v2.json
         self.flog = flog  # logs/flog.xml
         self.exports: Dict[int, str] = {}
         self.imports: Dict[int, str] = {}
+        self.sections: Dict[int, str] = {}
 
         self.sample_file_name: str
-        self.sample_file_analysis = None
-        self.sample_file_static_data = None
+        self.sample_file_analysis: File
+        self.sample_file_static_data: Optional[StaticData]
 
         self._find_sample_file()
         self._compute_exports()
+        self._compute_sections()
 
     def _find_sample_file(self):
-        for k, v in self.sv2.files.items():
-            if v.is_sample:
-                self.sample_file_name = k
-                self.sample_file_analysis = v
+        for file_name, file_analysis in self.sv2.files.items():
+            if file_analysis.is_sample:
+                # this indicates the sample submitted for analysis??
+                self.sample_file_name = file_name
+                self.sample_file_analysis = file_analysis
 
-                if v.ref_static_data:
-                    self.sample_file_static_data = self.sv2.static_data.get(v.ref_static_data.path[1])
+                if file_analysis.ref_static_data:
+                    self.sample_file_static_data = self.sv2.static_data.get(file_analysis.ref_static_data.path[1])
 
                 break
 
     def _compute_exports(self):
-        if not self.sample_file_static_data:
-            return
-
-        if not self.sample_file_static_data.pe:
-            return
-
-        pe = self.sample_file_static_data.pe
-
-        if pe.exports:
-            for export in pe.exports:
+        if self.sample_file_static_data and self.sample_file_static_data.pe:
+            for export in self.sample_file_static_data.pe.exports:
                 self.exports[export.address] = export.api.name
 
     def _compute_imports(self):
         # TODO (meh)
         ...
+
+    def _compute_sections(self):
+        if self.sample_file_static_data and self.sample_file_static_data.pe:
+            for section in self.sample_file_static_data.pe.sections:
+                self.sections[section.virtual_address] = section.name
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 81c150f27..da4c39539 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -8,7 +8,7 @@
 import logging
 from typing import Tuple, Iterator
 
-from capa.features.file import Export
+from capa.features.file import Export, Section
 from capa.features.common import Feature
 from capa.features.address import Address, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
@@ -26,6 +26,11 @@ def extract_import_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Add
     yield from []
 
 
+def extract_section_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for addr, name in analysis.sections.items():
+        yield Section(name), AbsoluteVirtualAddress(addr)
+
+
 def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
     for handler in FILE_HANDLERS:
         for feature, addr in handler(analysis):
@@ -35,6 +40,6 @@ def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address
 FILE_HANDLERS = (
     extract_import_names,
     extract_export_names,
-    # extract_section_names,
+    extract_section_names,
     # extract_file_strings,
 )
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 3c627f334..2a047fe01 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -100,11 +100,23 @@ class PEFileImportModule(BaseModel):
     apis: List[PEFileImport]
 
 
+class PEFileSection(BaseModel):
+    _type: str
+    entropy: float
+    flags: List[str] = []
+    name: str
+    raw_data_offset: int
+    raw_data_size: int
+    virtual_address: int
+    virtual_size: int
+
+
 class PEFile(BaseModel):
     _type: str
     basic_info: Optional[PEFileBasicInfo] = None
-    exports: Optional[List[PEFileExport]] = None
-    imports: Optional[List[PEFileImportModule]] = None
+    exports: List[PEFileExport] = []
+    imports: List[PEFileImportModule] = []
+    sections: List[PEFileSection] = []
 
 
 class StaticData(BaseModel):
@@ -121,11 +133,11 @@ class File(BaseModel):
     size: int
     is_truncated: bool
     mime_type: Optional[str] = None
-    operations: Optional[List[str]] = None
-    ref_filenames: Optional[List[GenericReference]] = None
-    ref_gfncalls: Optional[List[GenericReference]] = None
+    operations: List[str] = []
+    ref_filenames: List[GenericReference] = []
+    ref_gfncalls: List[GenericReference] = []
     ref_static_data: Optional[StaticDataReference] = None
-    ref_vti_matches: Optional[List[GenericReference]] = None
+    ref_vti_matches: List[GenericReference] = []
     verdict: str
 
 

From 8d3f032434e67d8dc90c925332af5204a8f97015 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 13 Jun 2024 16:43:23 -0600
Subject: [PATCH 007/105] vmray: clean up pydantic models and implement base
 address extraction

---
 capa/features/extractors/vmray/__init__.py  | 6 ++++++
 capa/features/extractors/vmray/extractor.py | 8 +++++++-
 capa/features/extractors/vmray/models.py    | 2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 662153603..1b67647fe 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -17,12 +17,14 @@ def __init__(self, sv2: SummaryV2, flog: Analysis):
         self.exports: Dict[int, str] = {}
         self.imports: Dict[int, str] = {}
         self.sections: Dict[int, str] = {}
+        self.base_address: int
 
         self.sample_file_name: str
         self.sample_file_analysis: File
         self.sample_file_static_data: Optional[StaticData]
 
         self._find_sample_file()
+        self._compute_base_address()
         self._compute_exports()
         self._compute_sections()
 
@@ -38,6 +40,10 @@ def _find_sample_file(self):
 
                 break
 
+    def _compute_base_address(self):
+        if self.sample_file_static_data and self.sample_file_static_data.pe:
+            self.base_address = self.sample_file_static_data.pe.basic_info.image_base
+
     def _compute_exports(self):
         if self.sample_file_static_data and self.sample_file_static_data.pe:
             for export in self.sample_file_static_data.pe.exports:
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 08c9abdee..ef6e824b2 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -14,7 +14,7 @@
 import capa.helpers
 import capa.features.extractors.vmray.file
 from capa.features.common import Feature
-from capa.features.address import Address
+from capa.features.address import Address, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
 from capa.features.extractors.vmray.models import Analysis, SummaryV2
 from capa.features.extractors.base_extractor import DynamicFeatureExtractor
@@ -38,6 +38,10 @@ def from_archive(cls, archive_path: Path):
 
         return cls(VMRayAnalysis(sv2, flog))
 
+    def get_base_address(self) -> Address:
+        # value according to the PE header, the actual trace may use a different imagebase
+        return AbsoluteVirtualAddress(self.analysis.base_address)
+
     def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
         yield from capa.features.extractors.vmray.file.extract_features(self.analysis)
 
@@ -50,3 +54,5 @@ def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
     extractor = VMRayExtractor.from_archive(input_path)
     for feat, addr in extractor.extract_file_features():
         print(f"{feat} -> {addr}")
+
+    print(f"base address: {hex(extractor.get_base_address())}")
\ No newline at end of file
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 2a047fe01..0aca68888 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -113,7 +113,7 @@ class PEFileSection(BaseModel):
 
 class PEFile(BaseModel):
     _type: str
-    basic_info: Optional[PEFileBasicInfo] = None
+    basic_info: PEFileBasicInfo
     exports: List[PEFileExport] = []
     imports: List[PEFileImportModule] = []
     sections: List[PEFileSection] = []

From 346a0693ad3e63b894cd38b8f4ab0fa6b6d21190 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 13 Jun 2024 16:48:12 -0600
Subject: [PATCH 008/105] vmray: clean up VMRayAnalysis

---
 capa/features/extractors/vmray/__init__.py  | 10 +++++-----
 capa/features/extractors/vmray/extractor.py |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 1b67647fe..0ee920818 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -21,7 +21,7 @@ def __init__(self, sv2: SummaryV2, flog: Analysis):
 
         self.sample_file_name: str
         self.sample_file_analysis: File
-        self.sample_file_static_data: Optional[StaticData]
+        self.sample_file_static_data: StaticData
 
         self._find_sample_file()
         self._compute_base_address()
@@ -36,16 +36,16 @@ def _find_sample_file(self):
                 self.sample_file_analysis = file_analysis
 
                 if file_analysis.ref_static_data:
-                    self.sample_file_static_data = self.sv2.static_data.get(file_analysis.ref_static_data.path[1])
+                    self.sample_file_static_data = self.sv2.static_data[file_analysis.ref_static_data.path[1]]
 
                 break
 
     def _compute_base_address(self):
-        if self.sample_file_static_data and self.sample_file_static_data.pe:
+        if self.sample_file_static_data.pe:
             self.base_address = self.sample_file_static_data.pe.basic_info.image_base
 
     def _compute_exports(self):
-        if self.sample_file_static_data and self.sample_file_static_data.pe:
+        if self.sample_file_static_data.pe:
             for export in self.sample_file_static_data.pe.exports:
                 self.exports[export.address] = export.api.name
 
@@ -54,6 +54,6 @@ def _compute_imports(self):
         ...
 
     def _compute_sections(self):
-        if self.sample_file_static_data and self.sample_file_static_data.pe:
+        if self.sample_file_static_data.pe:
             for section in self.sample_file_static_data.pe.sections:
                 self.sections[section.virtual_address] = section.name
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index ef6e824b2..af874486a 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -55,4 +55,4 @@ def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
     for feat, addr in extractor.extract_file_features():
         print(f"{feat} -> {addr}")
 
-    print(f"base address: {hex(extractor.get_base_address())}")
\ No newline at end of file
+    print(f"base address: {hex(extractor.get_base_address())}")

From 7e079d4d35d0d09a8d46ec989b682462e7e71469 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 13 Jun 2024 16:52:25 -0600
Subject: [PATCH 009/105] vmray: restrict analysis to PE files

---
 capa/features/extractors/vmray/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 0ee920818..a05282785 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -5,8 +5,9 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-from typing import Dict, Optional
+from typing import Dict
 
+from capa.exceptions import UnsupportedFormatError
 from capa.features.extractors.vmray.models import File, Analysis, SummaryV2, StaticData
 
 
@@ -28,6 +29,9 @@ def __init__(self, sv2: SummaryV2, flog: Analysis):
         self._compute_exports()
         self._compute_sections()
 
+        if not self.sample_file_static_data.pe:
+            raise UnsupportedFormatError("VMRay feature extractor only supports PE at this time")
+
     def _find_sample_file(self):
         for file_name, file_analysis in self.sv2.files.items():
             if file_analysis.is_sample:

From 00cb7924e1a992338a49616ca1c3e9ff058f9935 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 13 Jun 2024 17:02:50 -0600
Subject: [PATCH 010/105] vmray: clean up pydantic models and add sample hash
 extraction

---
 capa/features/extractors/vmray/extractor.py | 12 ++++++++++--
 capa/features/extractors/vmray/models.py    | 18 ++++++++----------
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index af874486a..99c6704c1 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -17,13 +17,21 @@
 from capa.features.address import Address, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
 from capa.features.extractors.vmray.models import Analysis, SummaryV2
-from capa.features.extractors.base_extractor import DynamicFeatureExtractor
+from capa.features.extractors.base_extractor import SampleHashes, DynamicFeatureExtractor
 
 # TODO also/or look into xmltodict?
 
 
 class VMRayExtractor(DynamicFeatureExtractor):
-    def __init__(self, analysis):
+    def __init__(self, analysis: VMRayAnalysis):
+        super().__init__(
+            hashes=SampleHashes(
+                md5=analysis.sample_file_analysis.hash_values.md5.lower(),
+                sha1=analysis.sample_file_analysis.hash_values.sha1.lower(),
+                sha256=analysis.sample_file_analysis.hash_values.sha256.lower(),
+            )
+        )
+
         self.analysis = analysis
 
     @classmethod
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 0aca68888..586024dff 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -51,7 +51,6 @@ class Analysis(BaseXmlModel, tag="analysis"):
 
 
 class GenericReference(BaseModel):
-    _type: str
     path: List[str]
     source: str
 
@@ -60,7 +59,6 @@ class StaticDataReference(GenericReference): ...
 
 
 class PEFileBasicInfo(BaseModel):
-    _type: str
     compile_time: str
     file_type: str
     image_base: int
@@ -74,19 +72,16 @@ class PEFileBasicInfo(BaseModel):
 
 
 class API(BaseModel):
-    _type: str
     name: str
     ordinal: Optional[int] = None
 
 
 class PEFileExport(BaseModel):
-    _type: str
     address: int
     api: API
 
 
 class PEFileImport(BaseModel):
-    _type: str
     address: int
     api: API
     thunk_offset: int
@@ -95,13 +90,11 @@ class PEFileImport(BaseModel):
 
 
 class PEFileImportModule(BaseModel):
-    _type: str
     dll: str
     apis: List[PEFileImport]
 
 
 class PEFileSection(BaseModel):
-    _type: str
     entropy: float
     flags: List[str] = []
     name: str
@@ -112,7 +105,6 @@ class PEFileSection(BaseModel):
 
 
 class PEFile(BaseModel):
-    _type: str
     basic_info: PEFileBasicInfo
     exports: List[PEFileExport] = []
     imports: List[PEFileImportModule] = []
@@ -123,10 +115,16 @@ class StaticData(BaseModel):
     pe: Optional[PEFile] = None
 
 
+class FileHashes(BaseModel):
+    md5: str
+    sha1: str
+    sha256: str
+    ssdeep: str
+
+
 class File(BaseModel):
-    _type: str
     categories: List[str]
-    hash_values: Dict[str, str]
+    hash_values: FileHashes
     is_artifact: bool
     is_ioc: bool
     is_sample: bool

From 8b913e05448f337c521c04afed50c372979453b0 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 14 Jun 2024 09:32:02 -0600
Subject: [PATCH 011/105] vmray: extract global features for PE files

---
 capa/features/extractors/vmray/extractor.py | 24 ++++++---
 capa/features/extractors/vmray/global_.py   | 59 +++++++++++++++++++++
 capa/features/extractors/vmray/models.py    |  6 +++
 3 files changed, 82 insertions(+), 7 deletions(-)
 create mode 100644 capa/features/extractors/vmray/global_.py

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 99c6704c1..74b3a6b55 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -13,6 +13,7 @@
 
 import capa.helpers
 import capa.features.extractors.vmray.file
+import capa.features.extractors.vmray.global_
 from capa.features.common import Feature
 from capa.features.address import Address, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
@@ -34,6 +35,19 @@ def __init__(self, analysis: VMRayAnalysis):
 
         self.analysis = analysis
 
+        # pre-compute these because we'll yield them at *every* scope.
+        self.global_features = list(capa.features.extractors.vmray.global_.extract_features(self.analysis))
+
+    def get_base_address(self) -> Address:
+        # value according to the PE header, the actual trace may use a different imagebase
+        return AbsoluteVirtualAddress(self.analysis.base_address)
+
+    def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
+        yield from capa.features.extractors.vmray.file.extract_features(self.analysis)
+
+    def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
+        yield from self.global_features
+
     @classmethod
     def from_archive(cls, archive_path: Path):
         archive = ZipFile(archive_path, "r")
@@ -46,13 +60,6 @@ def from_archive(cls, archive_path: Path):
 
         return cls(VMRayAnalysis(sv2, flog))
 
-    def get_base_address(self) -> Address:
-        # value according to the PE header, the actual trace may use a different imagebase
-        return AbsoluteVirtualAddress(self.analysis.base_address)
-
-    def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
-        yield from capa.features.extractors.vmray.file.extract_features(self.analysis)
-
 
 if __name__ == "__main__":
     import sys
@@ -60,7 +67,10 @@ def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
     input_path = Path(sys.argv[1])
 
     extractor = VMRayExtractor.from_archive(input_path)
+
     for feat, addr in extractor.extract_file_features():
         print(f"{feat} -> {addr}")
+    for feat, addr in extractor.extract_global_features():
+        print(f"{feat} -> {addr}")
 
     print(f"base address: {hex(extractor.get_base_address())}")
diff --git a/capa/features/extractors/vmray/global_.py b/capa/features/extractors/vmray/global_.py
new file mode 100644
index 000000000..1c9da0a68
--- /dev/null
+++ b/capa/features/extractors/vmray/global_.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import logging
+from typing import Tuple, Iterator
+
+from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Feature
+from capa.features.address import NO_ADDRESS, Address
+from capa.features.extractors.vmray import VMRayAnalysis
+
+logger = logging.getLogger(__name__)
+
+
+def extract_arch(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    sample_type: str = analysis.sv2.analysis_metadata.sample_type
+
+    if "x86-32" in sample_type:
+        yield Arch(ARCH_I386), NO_ADDRESS
+    elif "x86-64" in sample_type:
+        yield Arch(ARCH_AMD64), NO_ADDRESS
+    else:
+        logger.warning("unrecognized arch: %s", sample_type)
+        raise ValueError(f"unrecognized arch from the VMRay report; output of file command: {sample_type}")
+
+
+def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    if analysis.sample_file_static_data.pe:
+        yield Format(FORMAT_PE), NO_ADDRESS
+    else:
+        logger.warning("unrecognized file format: %s", analysis.sv2.analysis_metadata.sample_type)
+        raise ValueError("unrecognized file format from the VMRay report: {analysis.sv2.analysis_metadata.sample_type}")
+
+
+def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    sample_type: str = analysis.sv2.analysis_metadata.sample_type
+
+    if "windows" in sample_type.lower():
+        yield OS(OS_WINDOWS), NO_ADDRESS
+    else:
+        logger.warning("unrecognized OS: %s", sample_type)
+        raise ValueError("unrecognized OS from the VMRay report: {sample_type}")
+
+
+def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for global_handler in GLOBAL_HANDLER:
+        for feature, addr in global_handler(analysis):
+            yield feature, addr
+
+
+GLOBAL_HANDLER = (
+    extract_format,
+    extract_os,
+    extract_arch,
+)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 586024dff..ea94404a2 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -139,6 +139,12 @@ class File(BaseModel):
     verdict: str
 
 
+class AnalysisMetadata(BaseModel):
+    sample_type: str
+    submission_filename: str
+
+
 class SummaryV2(BaseModel):
     files: Dict[str, File]
     static_data: Dict[str, StaticData]
+    analysis_metadata: AnalysisMetadata

From 654804878f203669d6c569da4d4743d227acc16b Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 14 Jun 2024 09:34:59 -0600
Subject: [PATCH 012/105] vmray: clean up global_.py debug output

---
 capa/features/extractors/vmray/global_.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/capa/features/extractors/vmray/global_.py b/capa/features/extractors/vmray/global_.py
index 1c9da0a68..dd18a1511 100644
--- a/capa/features/extractors/vmray/global_.py
+++ b/capa/features/extractors/vmray/global_.py
@@ -25,7 +25,7 @@ def extract_arch(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
         yield Arch(ARCH_AMD64), NO_ADDRESS
     else:
         logger.warning("unrecognized arch: %s", sample_type)
-        raise ValueError(f"unrecognized arch from the VMRay report; output of file command: {sample_type}")
+        raise ValueError(f"unrecognized arch from the VMRay report: {sample_type}")
 
 
 def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
@@ -33,7 +33,7 @@ def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]
         yield Format(FORMAT_PE), NO_ADDRESS
     else:
         logger.warning("unrecognized file format: %s", analysis.sv2.analysis_metadata.sample_type)
-        raise ValueError("unrecognized file format from the VMRay report: {analysis.sv2.analysis_metadata.sample_type}")
+        raise ValueError(f"unrecognized file format from the VMRay report: {analysis.sv2.analysis_metadata.sample_type}")
 
 
 def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
@@ -43,7 +43,7 @@ def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
         yield OS(OS_WINDOWS), NO_ADDRESS
     else:
         logger.warning("unrecognized OS: %s", sample_type)
-        raise ValueError("unrecognized OS from the VMRay report: {sample_type}")
+        raise ValueError(f"unrecognized OS from the VMRay report: {sample_type}")
 
 
 def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:

From f3d69529b092f92f3732fbfc71061ab4faf4ad62 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 18 Jun 2024 13:27:40 -0600
Subject: [PATCH 013/105] vmray: invoke VMRay feature extractor from capa.main

---
 capa/features/common.py                     |  2 +
 capa/features/extractors/vmray/extractor.py | 60 ++++++++++++++-------
 capa/features/extractors/vmray/global_.py   |  4 +-
 capa/helpers.py                             | 29 ++++++----
 capa/loader.py                              | 12 +++++
 capa/main.py                                | 18 ++++++-
 6 files changed, 94 insertions(+), 31 deletions(-)

diff --git a/capa/features/common.py b/capa/features/common.py
index cb938f299..6ec1ce8b1 100644
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -461,6 +461,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
 FORMAT_SC32 = "sc32"
 FORMAT_SC64 = "sc64"
 FORMAT_CAPE = "cape"
+FORMAT_VMRAY = "vmray"
 FORMAT_FREEZE = "freeze"
 FORMAT_RESULT = "result"
 STATIC_FORMATS = {
@@ -474,6 +475,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
 }
 DYNAMIC_FORMATS = {
     FORMAT_CAPE,
+    FORMAT_VMRAY,
     FORMAT_FREEZE,
     FORMAT_RESULT,
 }
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 74b3a6b55..e5e7b7746 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -18,7 +18,13 @@
 from capa.features.address import Address, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
 from capa.features.extractors.vmray.models import Analysis, SummaryV2
-from capa.features.extractors.base_extractor import SampleHashes, DynamicFeatureExtractor
+from capa.features.extractors.base_extractor import (
+    CallHandle,
+    SampleHashes,
+    ThreadHandle,
+    ProcessHandle,
+    DynamicFeatureExtractor,
+)
 
 # TODO also/or look into xmltodict?
 
@@ -48,29 +54,47 @@ def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
     def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
         yield from self.global_features
 
-    @classmethod
-    def from_archive(cls, archive_path: Path):
-        archive = ZipFile(archive_path, "r")
+    def get_processes(self) -> Iterator[ProcessHandle]:
+        # TODO (meh)
+        yield from []
 
-        sv2_json = json.loads(archive.read("logs/summary_v2.json", pwd=b"infected"))
-        sv2 = SummaryV2.model_validate(sv2_json)
+    def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
+        # TODO (meh)
+        yield from []
 
-        flog_xml = archive.read("logs/flog.xml", pwd=b"infected")
-        flog = Analysis.from_xml(flog_xml)
+    def get_process_name(self, ph) -> str:
+        # TODO (meh)
+        raise NotImplementedError()
 
-        return cls(VMRayAnalysis(sv2, flog))
+    def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
+        # TODO (meh)
+        yield from []
 
+    def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
+        # TODO (meh)
+        yield from []
 
-if __name__ == "__main__":
-    import sys
+    def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
+        # TODO (meh)
+        yield from []
 
-    input_path = Path(sys.argv[1])
+    def extract_call_features(
+        self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
+    ) -> Iterator[Tuple[Feature, Address]]:
+        # TODO (meh)
+        yield from []
 
-    extractor = VMRayExtractor.from_archive(input_path)
+    def get_call_name(self, ph, th, ch) -> str:
+        # TODO (meh)
+        raise NotImplementedError()
 
-    for feat, addr in extractor.extract_file_features():
-        print(f"{feat} -> {addr}")
-    for feat, addr in extractor.extract_global_features():
-        print(f"{feat} -> {addr}")
+    @classmethod
+    def from_zipfile(cls, zipfile_path: Path):
+        with ZipFile(zipfile_path, "r") as zipfile:
+            sv2_json = json.loads(zipfile.read("logs/summary_v2.json", pwd=b"infected"))
+            sv2 = SummaryV2.model_validate(sv2_json)
 
-    print(f"base address: {hex(extractor.get_base_address())}")
+            flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected")
+            flog = Analysis.from_xml(flog_xml)
+
+        return cls(VMRayAnalysis(sv2, flog))
diff --git a/capa/features/extractors/vmray/global_.py b/capa/features/extractors/vmray/global_.py
index dd18a1511..c2a7e3b78 100644
--- a/capa/features/extractors/vmray/global_.py
+++ b/capa/features/extractors/vmray/global_.py
@@ -33,7 +33,9 @@ def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]
         yield Format(FORMAT_PE), NO_ADDRESS
     else:
         logger.warning("unrecognized file format: %s", analysis.sv2.analysis_metadata.sample_type)
-        raise ValueError(f"unrecognized file format from the VMRay report: {analysis.sv2.analysis_metadata.sample_type}")
+        raise ValueError(
+            f"unrecognized file format from the VMRay report: {analysis.sv2.analysis_metadata.sample_type}"
+        )
 
 
 def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
diff --git a/capa/helpers.py b/capa/helpers.py
index 77380c7ed..c5880edc7 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -12,8 +12,9 @@
 import logging
 import contextlib
 import importlib.util
-from typing import NoReturn
+from typing import List, NoReturn
 from pathlib import Path
+from zipfile import ZipFile
 
 import tqdm
 
@@ -23,6 +24,7 @@
     FORMAT_CAPE,
     FORMAT_SC32,
     FORMAT_SC64,
+    FORMAT_VMRAY,
     FORMAT_DOTNET,
     FORMAT_FREEZE,
     FORMAT_UNKNOWN,
@@ -31,7 +33,7 @@
 
 EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32")
 EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64")
-EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz")
+EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz", ".zip")
 EXTENSIONS_ELF = "elf_"
 EXTENSIONS_FREEZE = "frz"
 
@@ -83,14 +85,21 @@ def load_json_from_path(json_path: Path):
 
 
 def get_format_from_report(sample: Path) -> str:
-    report = load_json_from_path(sample)
-    if "CAPE" in report:
-        return FORMAT_CAPE
-
-    if "target" in report and "info" in report and "behavior" in report:
-        # CAPE report that's missing the "CAPE" key,
-        # which is not going to be much use, but its correct.
-        return FORMAT_CAPE
+    if not sample.name.endswith(".zip"):
+        report = load_json_from_path(sample)
+        if "CAPE" in report:
+            return FORMAT_CAPE
+
+        if "target" in report and "info" in report and "behavior" in report:
+            # CAPE report that's missing the "CAPE" key,
+            # which is not going to be much use, but its correct.
+            return FORMAT_CAPE
+    else:
+        with ZipFile(sample, "r") as zipfile:
+            namelist: List[str] = zipfile.namelist()
+            if "logs/summary_v2.json" in namelist and "logs/flog.xml" in namelist:
+                # assume VMRay zipfile at a minimum has these files
+                return FORMAT_VMRAY
 
     return FORMAT_UNKNOWN
 
diff --git a/capa/loader.py b/capa/loader.py
index e741175e7..98c08bda7 100644
--- a/capa/loader.py
+++ b/capa/loader.py
@@ -44,6 +44,7 @@
     FORMAT_CAPE,
     FORMAT_SC32,
     FORMAT_SC64,
+    FORMAT_VMRAY,
     FORMAT_DOTNET,
 )
 from capa.features.address import Address
@@ -61,6 +62,7 @@
 BACKEND_BINJA = "binja"
 BACKEND_PEFILE = "pefile"
 BACKEND_CAPE = "cape"
+BACKEND_VMRAY = "vmray"
 BACKEND_FREEZE = "freeze"
 
 
@@ -199,6 +201,11 @@ def get_extractor(
         report = capa.helpers.load_json_from_path(input_path)
         return capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)
 
+    elif backend == BACKEND_VMRAY:
+        import capa.features.extractors.vmray.extractor
+
+        return capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_path)
+
     elif backend == BACKEND_DOTNET:
         import capa.features.extractors.dnfile.extractor
 
@@ -316,6 +323,11 @@ def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtr
         report = capa.helpers.load_json_from_path(input_file)
         file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report))
 
+    elif input_format == FORMAT_VMRAY:
+        import capa.features.extractors.vmray.extractor
+
+        file_extractors.append(capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_file))
+
     return file_extractors
 
 
diff --git a/capa/main.py b/capa/main.py
index eb43769d2..7bca16e16 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -42,7 +42,15 @@
 import capa.features.extractors.common
 from capa.rules import RuleSet
 from capa.engine import MatchResults
-from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, BACKEND_DOTNET, BACKEND_FREEZE, BACKEND_PEFILE
+from capa.loader import (
+    BACKEND_VIV,
+    BACKEND_CAPE,
+    BACKEND_BINJA,
+    BACKEND_VMRAY,
+    BACKEND_DOTNET,
+    BACKEND_FREEZE,
+    BACKEND_PEFILE,
+)
 from capa.helpers import (
     get_file_taste,
     get_auto_format,
@@ -70,6 +78,7 @@
     FORMAT_CAPE,
     FORMAT_SC32,
     FORMAT_SC64,
+    FORMAT_VMRAY,
     FORMAT_DOTNET,
     FORMAT_FREEZE,
     FORMAT_RESULT,
@@ -232,6 +241,7 @@ def install_common_args(parser, wanted=None):
             (FORMAT_SC32, "32-bit shellcode"),
             (FORMAT_SC64, "64-bit shellcode"),
             (FORMAT_CAPE, "CAPE sandbox report"),
+            (FORMAT_VMRAY, "VMRay sandbox report"),
             (FORMAT_FREEZE, "features previously frozen by capa"),
         ]
         format_help = ", ".join([f"{f[0]}: {f[1]}" for f in formats])
@@ -253,6 +263,7 @@ def install_common_args(parser, wanted=None):
             (BACKEND_DOTNET, ".NET"),
             (BACKEND_FREEZE, "capa freeze"),
             (BACKEND_CAPE, "CAPE"),
+            (BACKEND_VMRAY, "VMRay"),
         ]
         backend_help = ", ".join([f"{f[0]}: {f[1]}" for f in backends])
         parser.add_argument(
@@ -505,6 +516,9 @@ def get_backend_from_cli(args, input_format: str) -> str:
     if input_format == FORMAT_CAPE:
         return BACKEND_CAPE
 
+    elif input_format == FORMAT_VMRAY:
+        return BACKEND_VMRAY
+
     elif input_format == FORMAT_DOTNET:
         return BACKEND_DOTNET
 
@@ -529,7 +543,7 @@ def get_sample_path_from_cli(args, backend: str) -> Optional[Path]:
     raises:
       ShouldExitError: if the program is invoked incorrectly and should exit.
     """
-    if backend == BACKEND_CAPE:
+    if backend in (BACKEND_CAPE, BACKEND_VMRAY):
         return None
     else:
         return args.input_file

From 8f32b7fc65767b6e09ae1462163ee79044bff558 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 18 Jun 2024 14:32:11 -0600
Subject: [PATCH 014/105] vmray: emit process handles

---
 capa/features/extractors/vmray/extractor.py |  3 +--
 capa/features/extractors/vmray/file.py      | 17 ++++++++++++--
 capa/features/extractors/vmray/models.py    | 26 ++++++++++++++++++++-
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index e5e7b7746..d56748a15 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -55,8 +55,7 @@ def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
         yield from self.global_features
 
     def get_processes(self) -> Iterator[ProcessHandle]:
-        # TODO (meh)
-        yield from []
+        yield from capa.features.extractors.vmray.file.get_processes(self.analysis)
 
     def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
         # TODO (meh)
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index da4c39539..5a28b4722 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -6,16 +6,29 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 import logging
-from typing import Tuple, Iterator
+from typing import Dict, Tuple, Iterator
 
 from capa.features.file import Export, Section
 from capa.features.common import Feature
-from capa.features.address import Address, AbsoluteVirtualAddress
+from capa.features.address import Address, ProcessAddress, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
+from capa.features.extractors.vmray.models import Process
+from capa.features.extractors.base_extractor import ProcessHandle
 
 logger = logging.getLogger(__name__)
 
 
+def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
+    processes: Dict[str, Process] = analysis.sv2.processes
+
+    for _, process in processes.items():
+        pid = process.os_pid
+        ppid = processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0
+
+        addr = ProcessAddress(pid=pid, ppid=ppid)
+        yield ProcessHandle(address=addr, inner=process)
+
+
 def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
     for addr, name in analysis.exports.items():
         yield Export(name), AbsoluteVirtualAddress(addr)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index ea94404a2..958d59d94 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -139,12 +139,36 @@ class File(BaseModel):
     verdict: str
 
 
+class Process(BaseModel):
+    bitness: int
+    is_artifact: bool
+    is_ioc: bool
+    monitor_id: int
+    monitor_reason: str
+    os_pid: int
+    filename: str
+    ref_parent_process: Optional[GenericReference] = None
+
+
+class Artifacts(BaseModel):
+    ref_processes: List[GenericReference] = []
+    ref_domains: List[GenericReference] = []
+    ref_filenames: List[GenericReference] = []
+    ref_files: List[GenericReference] = []
+    ref_ip_addresses: List[GenericReference] = []
+    ref_mutexes: List[GenericReference] = []
+    ref_registry_records: List[GenericReference] = []
+
+
 class AnalysisMetadata(BaseModel):
     sample_type: str
     submission_filename: str
 
 
 class SummaryV2(BaseModel):
+    analysis_metadata: AnalysisMetadata
+    artifacts: Artifacts
+
     files: Dict[str, File]
     static_data: Dict[str, StaticData]
-    analysis_metadata: AnalysisMetadata
+    processes: Dict[str, Process]

From b3ebf80d9b1858cb61da060ce376b971d30f5e4d Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 18 Jun 2024 14:41:47 -0600
Subject: [PATCH 015/105] vmray: emit process name

---
 capa/features/extractors/vmray/extractor.py | 6 +++---
 capa/features/extractors/vmray/models.py    | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index d56748a15..8ec94b756 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -17,7 +17,7 @@
 from capa.features.common import Feature
 from capa.features.address import Address, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
-from capa.features.extractors.vmray.models import Analysis, SummaryV2
+from capa.features.extractors.vmray.models import Process, Analysis, SummaryV2
 from capa.features.extractors.base_extractor import (
     CallHandle,
     SampleHashes,
@@ -62,8 +62,8 @@ def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature,
         yield from []
 
     def get_process_name(self, ph) -> str:
-        # TODO (meh)
-        raise NotImplementedError()
+        process: Process = ph.inner
+        return process.image_name
 
     def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
         # TODO (meh)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 958d59d94..4ee6e9e95 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -147,6 +147,7 @@ class Process(BaseModel):
     monitor_reason: str
     os_pid: int
     filename: str
+    image_name: str
     ref_parent_process: Optional[GenericReference] = None
 
 

From e5fa800ffba22f131343209005feffc758b6770b Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 18 Jun 2024 14:45:08 -0600
Subject: [PATCH 016/105] vmray: emit empty thread features

---
 capa/features/extractors/vmray/extractor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 8ec94b756..19eb3b61e 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -70,7 +70,8 @@ def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
         yield from []
 
     def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
-        # TODO (meh)
+        # force this routine to be a generator,
+        # but we don't actually have any elements to generate.
         yield from []
 
     def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:

From d26a80664741be9af285163b14fcf187b2d67b9b Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 18 Jun 2024 14:59:29 -0600
Subject: [PATCH 017/105] vmray: update scripts/show-features.py to emit
 process name from extractor

---
 scripts/show-features.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/show-features.py b/scripts/show-features.py
index 9813a26dd..30ad2a4be 100644
--- a/scripts/show-features.py
+++ b/scripts/show-features.py
@@ -227,13 +227,13 @@ def print_static_features(functions, extractor: StaticFeatureExtractor):
 
 def print_dynamic_features(processes, extractor: DynamicFeatureExtractor):
     for p in processes:
-        print(f"proc: {p.inner.process_name} (ppid={p.address.ppid}, pid={p.address.pid})")
+        print(f"proc: {extractor.get_process_name(p)} (ppid={p.address.ppid}, pid={p.address.pid})")
 
         for feature, addr in extractor.extract_process_features(p):
             if is_global_feature(feature):
                 continue
 
-            print(f" proc: {p.inner.process_name}: {feature}")
+            print(f" proc: {extractor.get_process_name(p)}: {feature}")
 
             for t in extractor.get_threads(p):
                 print(f"  thread: {t.address.tid}")

From 2b70086467cbf510e537a1fc5199df65dfc12644 Mon Sep 17 00:00:00 2001
From: r-sm2024 <moonsunghyun96@gmail.com>
Date: Tue, 18 Jun 2024 21:29:57 +0000
Subject: [PATCH 018/105] Add VMRayanalysis model and call parser

---
 capa/features/extractors/vmray/call.py   | 57 ++++++++++++++++++++
 capa/features/extractors/vmray/models.py | 67 ++++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 capa/features/extractors/vmray/call.py
 create mode 100644 capa/features/extractors/vmray/models.py

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
new file mode 100644
index 000000000..f6a3167ad
--- /dev/null
+++ b/capa/features/extractors/vmray/call.py
@@ -0,0 +1,57 @@
+import logging
+from typing import Tuple, Iterator
+
+from capa.helpers import assert_never
+from capa.features.insn import API, Number
+from capa.features.common import String, Feature
+from capa.features.address import Address
+from capa.features.extractors.vmray.models import FunctionCall, Analysis
+from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
+
+logger = logging.getLogger(__name__)
+
+
+def extract_function_calls(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
+    """
+    this method extracts the given call's features (such as API name and arguments),
+    and returns them as API, Number, and String features.
+
+    args:
+      call: FunctionCall object representing the XML fncall element
+      
+      yields: Feature, address; where Feature is either: API, Number, or String.
+    """
+
+    # Extract API name
+    yield API(ch.inner.name), ch.inner.address  
+
+    # Extract arguments from <in>
+    for param in ch.inner.in_:
+        value = param.value
+        if isinstance(value, str):
+            yield String(value), ch.inner.address
+
+        elif isinstance(value, int):
+            yield Number(value), ch.inner.address
+
+        else:
+            assert_never(value)
+
+    # Extract return value from <out>
+    if ch.inner.out is not None:
+        value = ch.inner.out.value
+        if isinstance(value, str):
+            yield String(value), ch.inner.address
+
+        elif isinstance(value, int):
+            yield Number(value), ch.inner.address
+
+        else:
+            assert_never(value)
+
+def extract_features(analysis: Analysis) -> Iterator[Tuple[Feature, Address]]:
+    '''
+    Extract features from the Analysis object in models.py
+    '''
+    for fncall in analysis.fncalls:
+        yield from extract_function_calls(fncall)
\ No newline at end of file
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
new file mode 100644
index 000000000..d4adaead0
--- /dev/null
+++ b/capa/features/extractors/vmray/models.py
@@ -0,0 +1,67 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+from typing import Any, Dict, List, Union, Literal, Optional
+from pydantic_xml import BaseXmlModel, attr, element
+
+#
+class Param(BaseXmlModel):
+    name: str = attr()
+    type: str = attr()
+    value: str = attr()
+
+
+class FunctionCall(BaseXmlModel, tag="fncall"):
+    ts: int = attr()
+    fncall_id: int = attr()
+    process_id: int = attr()
+    name: str = attr() #API call name?
+    address: str = attr() #address
+    from_: str = attr() 
+    in_: List[Param] = element(name="in")
+    out: Optional[Param] = element(name="out")
+
+class FunctionReturn(BaseXmlModel, tag="fnret"):
+    ts: int = attr()
+    fncall_id: int = attr()
+    addr: str = attr() #string that contains a hex value
+    from_: str = attr #string that contains a hex value
+
+class MonitorProcess(BaseXmlModel, tag="monitor_process"):
+    ts: int = attr()
+    process_id: int = attr()
+    image_name: str = attr()
+
+
+class MonitorThread(BaseXmlModel, tag="monitor_thread"):
+    ts: int = attr()
+    thread_id: int = attr()
+    process_id: int = attr()
+    os_tid: str = attr()  # TODO hex
+
+class NewRegion(BaseXmlModel):
+    ts: int = attr()
+    start_va: str = attr()
+    end_va: str = attr()
+    entry_point: str = attr()
+
+class RemoveRegion(BaseXmlModel, tag="remove_region"):
+    ts: int = attr()
+    region_id: int = attr()
+
+class Analysis(BaseXmlModel, tag="analysis"):
+    log_version: str = attr()
+    analyzer_version: str = attr()
+    analysis_date: str = attr()
+    processes: List[MonitorProcess] = element(tag="monitor_process")
+    threads: List[MonitorThread] = element(tag="monitor_thread")
+    new_regions: List[NewRegion] = element(tag="new_region")
+    remove_regions: List[RemoveRegion] = element(tag="remove_region")
+    fncalls: List[FunctionCall] = element(tag="fncall")
+    fnrets: List[FunctionReturn] = element(tag="fnret")
+   
\ No newline at end of file

From 3cca80860d838ba174021849a2165ccd1ad8ad9a Mon Sep 17 00:00:00 2001
From: r-sm2024 <moonsunghyun96@gmail.com>
Date: Tue, 18 Jun 2024 21:30:24 +0000
Subject: [PATCH 019/105] Add VMRayanalysis model and call parser

---
 capa/features/extractors/vmray/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index d4adaead0..83db6f0a1 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -9,7 +9,7 @@
 from typing import Any, Dict, List, Union, Literal, Optional
 from pydantic_xml import BaseXmlModel, attr, element
 
-#
+##
 class Param(BaseXmlModel):
     name: str = attr()
     type: str = attr()

From 574d61ad8f5572bb0d881a191dd2a77ba7774f61 Mon Sep 17 00:00:00 2001
From: r-sm2024 <moonsunghyun96@gmail.com>
Date: Tue, 18 Jun 2024 21:33:50 +0000
Subject: [PATCH 020/105] Add VMRayanalysis model and call parser

---
 capa/features/extractors/vmray/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 83db6f0a1..d4adaead0 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -9,7 +9,7 @@
 from typing import Any, Dict, List, Union, Literal, Optional
 from pydantic_xml import BaseXmlModel, attr, element
 
-##
+#
 class Param(BaseXmlModel):
     name: str = attr()
     type: str = attr()

From 85a85e99bfe214280f5de3e481df0b1d30bbabfe Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 18 Jun 2024 15:38:44 -0600
Subject: [PATCH 021/105] vmray: emit recorded artifacts as strings

---
 capa/features/extractors/vmray/file.py   | 34 +++++++++++++-
 capa/features/extractors/vmray/models.py | 56 +++++++++++++++++++-----
 2 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 5a28b4722..2c6463c10 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -9,8 +9,8 @@
 from typing import Dict, Tuple, Iterator
 
 from capa.features.file import Export, Section
-from capa.features.common import Feature
-from capa.features.address import Address, ProcessAddress, AbsoluteVirtualAddress
+from capa.features.common import String, Feature
+from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
 from capa.features.extractors.vmray.models import Process
 from capa.features.extractors.base_extractor import ProcessHandle
@@ -44,6 +44,31 @@ def extract_section_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Ad
         yield Section(name), AbsoluteVirtualAddress(addr)
 
 
+def extract_referenced_filenames(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for _, filename in analysis.sv2.filenames.items():
+        yield String(filename.filename), NO_ADDRESS
+
+
+def extract_referenced_mutex_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for _, mutex in analysis.sv2.mutexes.items():
+        yield String(mutex.name), NO_ADDRESS
+
+
+def extract_referenced_domain_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for _, domain in analysis.sv2.domains.items():
+        yield String(domain.domain), NO_ADDRESS
+
+
+def extract_referenced_ip_addresses(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for _, ip_address in analysis.sv2.ip_addresses.items():
+        yield String(ip_address.ip_address), NO_ADDRESS
+
+
+def extract_referenced_registry_key_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    for _, registry_record in analysis.sv2.registry_records.items():
+        yield String(registry_record.reg_key_name), NO_ADDRESS
+
+
 def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
     for handler in FILE_HANDLERS:
         for feature, addr in handler(analysis):
@@ -54,5 +79,10 @@ def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address
     extract_import_names,
     extract_export_names,
     extract_section_names,
+    extract_referenced_filenames,
+    extract_referenced_mutex_names,
+    extract_referenced_domain_names,
+    extract_referenced_ip_addresses,
+    extract_referenced_registry_key_names,
     # extract_file_strings,
 )
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 4ee6e9e95..8b910bfef 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -151,14 +151,40 @@ class Process(BaseModel):
     ref_parent_process: Optional[GenericReference] = None
 
 
-class Artifacts(BaseModel):
-    ref_processes: List[GenericReference] = []
-    ref_domains: List[GenericReference] = []
-    ref_filenames: List[GenericReference] = []
-    ref_files: List[GenericReference] = []
-    ref_ip_addresses: List[GenericReference] = []
-    ref_mutexes: List[GenericReference] = []
-    ref_registry_records: List[GenericReference] = []
+class Filename(BaseModel):
+    filename: str
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class Mutex(BaseModel):
+    name: str
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class Registry(BaseModel):
+    reg_key_name: str
+    reg_key_value_type: Optional[str] = None
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class Domain(BaseModel):
+    domain: str
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class IPAddress(BaseModel):
+    ip_address: str
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
 
 
 class AnalysisMetadata(BaseModel):
@@ -168,8 +194,14 @@ class AnalysisMetadata(BaseModel):
 
 class SummaryV2(BaseModel):
     analysis_metadata: AnalysisMetadata
-    artifacts: Artifacts
 
-    files: Dict[str, File]
-    static_data: Dict[str, StaticData]
-    processes: Dict[str, Process]
+    static_data: Dict[str, StaticData] = {}
+
+    # recorded artifacts
+    files: Dict[str, File] = {}
+    processes: Dict[str, Process] = {}
+    filenames: Dict[str, Filename] = {}
+    mutexes: Dict[str, Mutex] = {}
+    domains: Dict[str, Domain] = {}
+    ip_addresses: Dict[str, IPAddress] = {}
+    registry_records: Dict[str, Registry] = {}

From a544aed55250237dac7d6343bd5b9fd8b7888738 Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 19 Jun 2024 14:49:12 +0000
Subject: [PATCH 022/105] add vmray-extractor branch for tests

---
 .github/workflows/tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index eb8ec1ced..00bcbbe7f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,9 +2,9 @@ name: CI
 
 on:
   push:
-    branches: [ master ]
+    branches: [ master, vmray-extractor ]
   pull_request:
-    branches: [ master ]
+    branches: [ master, vmray-extractor ]
 
 permissions: read-all
 

From d10b396300b8d735ed8b71de57ee3af1fd17dfd2 Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 19 Jun 2024 14:50:00 +0000
Subject: [PATCH 023/105] add pydantic-xml dependency

---
 pyproject.toml   | 1 +
 requirements.txt | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 3afcb54d3..178d8cbc4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,6 +79,7 @@ dependencies = [
     "rich>=13",
     "humanize>=4",
     "protobuf>=5",
+    "pydantic_xml[lxml]>=2.11",  # TODO benchmark lxml vs. elementtree - first impression eltree faster
 
     # ---------------------------------------
     # Dependencies that we develop
diff --git a/requirements.txt b/requirements.txt
index b667e63a7..b2465502e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,6 +28,7 @@ pyasn1-modules==0.2.8
 pycparser==2.22
 pydantic==2.7.3
 pydantic-core==2.18.4
+pydantic-xml==2.11.0
 pyelftools==0.31
 pygments==2.18.0
 python-flirt==0.8.10

From 453a640de9faf558905c5ac94f0eba7e176ac83f Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 19 Jun 2024 14:55:43 +0000
Subject: [PATCH 024/105] formatting

---
 capa/features/extractors/vmray/call.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index f6a3167ad..8a2475642 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -5,7 +5,7 @@
 from capa.features.insn import API, Number
 from capa.features.common import String, Feature
 from capa.features.address import Address
-from capa.features.extractors.vmray.models import FunctionCall, Analysis
+from capa.features.extractors.vmray.models import Analysis, FunctionCall
 from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
 
 logger = logging.getLogger(__name__)
@@ -18,12 +18,12 @@ def extract_function_calls(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle)
 
     args:
       call: FunctionCall object representing the XML fncall element
-      
+
       yields: Feature, address; where Feature is either: API, Number, or String.
     """
 
     # Extract API name
-    yield API(ch.inner.name), ch.inner.address  
+    yield API(ch.inner.name), ch.inner.address
 
     # Extract arguments from <in>
     for param in ch.inner.in_:
@@ -49,9 +49,10 @@ def extract_function_calls(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle)
         else:
             assert_never(value)
 
+
 def extract_features(analysis: Analysis) -> Iterator[Tuple[Feature, Address]]:
-    '''
+    """
     Extract features from the Analysis object in models.py
-    '''
+    """
     for fncall in analysis.fncalls:
-        yield from extract_function_calls(fncall)
\ No newline at end of file
+        yield from extract_function_calls(fncall)

From fbdfea1edcb5676784760d1ee7f0ab6efc1fab16 Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 19 Jun 2024 14:56:12 +0000
Subject: [PATCH 025/105] add testing code

---
 capa/features/extractors/vmray/extractor.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 19eb3b61e..791ea89e9 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -11,6 +11,8 @@
 from pathlib import Path
 from zipfile import ZipFile
 
+from devtools import debug, pprint
+
 import capa.helpers
 import capa.features.extractors.vmray.file
 import capa.features.extractors.vmray.global_
@@ -97,4 +99,14 @@ def from_zipfile(cls, zipfile_path: Path):
             flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected")
             flog = Analysis.from_xml(flog_xml)
 
+            # debug(flog)
+            pprint(flog.processes[0])
+
         return cls(VMRayAnalysis(sv2, flog))
+
+
+if __name__ == "__main__":
+    # TODO(mr): for testing, removeme
+    import sys
+    input_path = Path(sys.argv[1])
+    VMRayExtractor.from_zipfile(input_path)

From d256cc867f9dd2e92660da69df6a04fdce56477f Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 19 Jun 2024 14:57:05 +0000
Subject: [PATCH 026/105] update model and re-add summary_v2.json models

---
 capa/features/extractors/vmray/models.py | 222 +++++++++++++++++++++--
 1 file changed, 207 insertions(+), 15 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 5d6e2edf4..60398fa21 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -7,60 +7,252 @@
 # See the License for the specific language governing permissions and limitations under the License.
 
 from typing import Any, Dict, List, Union, Literal, Optional
+
+from pydantic import Field, BaseModel
 from pydantic_xml import BaseXmlModel, attr, element
 
-#
-class Param(BaseXmlModel):
+
+### models for flog.xml
+class Param(BaseXmlModel, tag="param"):
     name: str = attr()
     type: str = attr()
-    value: str = attr()
+    value: Optional[str] = attr(default=None)
+
+
+# or see https://pydantic-xml.readthedocs.io/en/latest/pages/quickstart.html#wrapper
+class In(BaseXmlModel, tag="in"):
+    params: List[Param] = element(name="in")
+
+
+class Out(BaseXmlModel, tag="out"):
+    params: List[Param] = element(name="out")
 
 
 class FunctionCall(BaseXmlModel, tag="fncall"):
     ts: int = attr()
     fncall_id: int = attr()
     process_id: int = attr()
-    name: str = attr() #API call name?
-    address: str = attr() #address
-    from_: str = attr() 
-    in_: List[Param] = element(name="in")
-    out: Optional[Param] = element(name="out")
+    thread_id: int = attr()
+    name: str = attr()  # API call name?
+    address: str = attr(name="addr")
+    from_: str = attr(name="from")
+    in_: Optional[In] = element(tag="in", default=None)
+    out_: Optional[Out] = element(tag="out", default=None)
+
 
+# note that not all fncalls always have an associated fnret, e.g. exit or WaitForSingleObject
 class FunctionReturn(BaseXmlModel, tag="fnret"):
     ts: int = attr()
     fncall_id: int = attr()
-    addr: str = attr() #string that contains a hex value
-    from_: str = attr #string that contains a hex value
+    address: str = attr(name="addr")  # string that contains a hex value
+    from_: str = attr(name="from")  # string that contains a hex value
 
+
+# TODO check multiple are there
 class MonitorProcess(BaseXmlModel, tag="monitor_process"):
     ts: int = attr()
     process_id: int = attr()
     image_name: str = attr()
 
 
+# TODO check multiple are there
 class MonitorThread(BaseXmlModel, tag="monitor_thread"):
     ts: int = attr()
     thread_id: int = attr()
     process_id: int = attr()
     os_tid: str = attr()  # TODO hex
 
-class NewRegion(BaseXmlModel):
+
+class NewRegion(BaseXmlModel, tag="new_region"):
     ts: int = attr()
+    region_id: int = attr()
+    process_id: int = attr()
     start_va: str = attr()
     end_va: str = attr()
     entry_point: str = attr()
 
+
 class RemoveRegion(BaseXmlModel, tag="remove_region"):
     ts: int = attr()
     region_id: int = attr()
 
-class Analysis(BaseXmlModel, tag="analysis"):
+
+# unordered is very slow, but elements may occur in any order
+class Analysis(BaseXmlModel, tag="analysis", search_mode="unordered"):
     log_version: str = attr()
     analyzer_version: str = attr()
     analysis_date: str = attr()
+
+    # super slow
+    # data: List[Union[MonitorProcess, MonitorThread, NewRegion, RemoveRegion, FunctionCall, FunctionReturn]]
+
+    # may want to preprocess file and remove/reorder entries for more efficient parsing
+
     processes: List[MonitorProcess] = element(tag="monitor_process")
     threads: List[MonitorThread] = element(tag="monitor_thread")
-    new_regions: List[NewRegion] = element(tag="new_region")
-    remove_regions: List[RemoveRegion] = element(tag="remove_region")
+
+    # not important and slow down parsing
+    # new_regions: List[NewRegion] = element(tag="new_region")
+    # remove_regions: List[RemoveRegion] = element(tag="remove_region")
+
+    # very slow alternative; calls: List[Union[FunctionCall, FunctionReturn]]
     fncalls: List[FunctionCall] = element(tag="fncall")
-    fnrets: List[FunctionReturn] = element(tag="fnret")
\ No newline at end of file
+    fnrets: List[FunctionReturn] = element(tag="fnret")
+
+
+### models for summary_v2.json files
+class GenericReference(BaseModel):
+    path: List[str]
+    source: str
+
+
+class StaticDataReference(GenericReference): ...
+
+
+class PEFileBasicInfo(BaseModel):
+    compile_time: str
+    file_type: str
+    image_base: int
+    machine_type: str
+    size_of_code: int
+    size_of_initialized_data: int
+    size_of_uninitialized_data: int
+    subsystem: str
+    entry_point: int
+    imphash: Optional[str] = None
+
+
+class API(BaseModel):
+    name: str
+    ordinal: Optional[int] = None
+
+
+class PEFileExport(BaseModel):
+    address: int
+    api: API
+
+
+class PEFileImport(BaseModel):
+    address: int
+    api: API
+    thunk_offset: int
+    hint: Optional[int] = None
+    thunk_rva: int
+
+
+class PEFileImportModule(BaseModel):
+    dll: str
+    apis: List[PEFileImport]
+
+
+class PEFileSection(BaseModel):
+    entropy: float
+    flags: List[str] = []
+    name: str
+    raw_data_offset: int
+    raw_data_size: int
+    virtual_address: int
+    virtual_size: int
+
+
+class PEFile(BaseModel):
+    basic_info: PEFileBasicInfo
+    exports: List[PEFileExport] = []
+    imports: List[PEFileImportModule] = []
+    sections: List[PEFileSection] = []
+
+
+class StaticData(BaseModel):
+    pe: Optional[PEFile] = None
+
+
+class FileHashes(BaseModel):
+    md5: str
+    sha1: str
+    sha256: str
+    ssdeep: str
+
+
+class File(BaseModel):
+    categories: List[str]
+    hash_values: FileHashes
+    is_artifact: bool
+    is_ioc: bool
+    is_sample: bool
+    size: int
+    is_truncated: bool
+    mime_type: Optional[str] = None
+    operations: List[str] = []
+    ref_filenames: List[GenericReference] = []
+    ref_gfncalls: List[GenericReference] = []
+    ref_static_data: Optional[StaticDataReference] = None
+    ref_vti_matches: List[GenericReference] = []
+    verdict: str
+
+
+class Process(BaseModel):
+    bitness: int
+    is_artifact: bool
+    is_ioc: bool
+    monitor_id: int
+    monitor_reason: str
+    os_pid: int
+    filename: str
+    image_name: str
+    ref_parent_process: Optional[GenericReference] = None
+
+
+class Filename(BaseModel):
+    filename: str
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class Mutex(BaseModel):
+    name: str
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class Registry(BaseModel):
+    reg_key_name: str
+    reg_key_value_type: Optional[str] = None
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class Domain(BaseModel):
+    domain: str
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class IPAddress(BaseModel):
+    ip_address: str
+    is_artifact: bool
+    is_ioc: bool
+    verdict: str
+
+
+class AnalysisMetadata(BaseModel):
+    sample_type: str
+    submission_filename: str
+
+
+class SummaryV2(BaseModel):
+    analysis_metadata: AnalysisMetadata
+
+    static_data: Dict[str, StaticData] = {}
+
+    # recorded artifacts
+    files: Dict[str, File] = {}
+    processes: Dict[str, Process] = {}
+    filenames: Dict[str, Filename] = {}
+    mutexes: Dict[str, Mutex] = {}
+    domains: Dict[str, Domain] = {}
+    ip_addresses: Dict[str, IPAddress] = {}
+    registry_records: Dict[str, Registry] = {}

From 740c73935676925c212cdf0117b96676fc616986 Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 19 Jun 2024 15:09:34 +0000
Subject: [PATCH 027/105] remove file

---
 vmray_parser.py | 138 ------------------------------------------------
 1 file changed, 138 deletions(-)
 delete mode 100644 vmray_parser.py

diff --git a/vmray_parser.py b/vmray_parser.py
deleted file mode 100644
index cd52e0d8e..000000000
--- a/vmray_parser.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import re
-import json
-import argparse
-
-from datetime import datetime
-
-class VMrayParser:
-
-    def read_vmray_log(self):
-        with open(self.filename, 'r') as f:
-            lines = f.readlines()
-        return lines
-
-    def __init__(self, filename, output_filename):
-        self.filename = filename
-        self.output_filename = output_filename
-        self.data = {}
-        self.processes = []  
-        self.current_process = None  
-        self.threads = []  
-
-    #Parse info section of VMray output
-    def parse_info(self, lines):
-        info_data = {}
-        for line in lines:
-            if line.startswith("# Analyzer Version:"):
-                info_data["analyzer_version"] = int(line.split(":")[1].strip().replace(".", ""))
-            elif line.startswith("# Analyzer Build Date:" ):
-                info_data["analyzer_build_date"] = datetime.strptime(line.split(":",1)[1].strip(),"%b %d %Y %H:%M:%S").isoformat()
-            elif line.startswith("# Log Creation Date:"):
-                info_data["log_create_date"] = datetime.strptime(line.split(":",1)[1].strip(), "%d.%m.%Y %H:%M:%S.%f").isoformat()
-        self.data["info"] = info_data
-
-    #Parse process data 
-    def parse_process(self, lines):
-
-        process_data = {}
-        
-
-        for line in lines:
-
-            #Match key:value format for the process section
-            ####Maybe since the process section puts ints in quotations, we can filter by that? Thread section doesn't.
-            
-            matches = re.findall(r"\s+(.+?) = \"(.*?)\"", line) #old r"\s+(.+?) = (.*)"
-            
-            
-            for match in matches:
-                key = match[0]
-                
-                if match[1]:
-                    value = match[1]
-                elif match[2]:
-                    value = match[2]
-
-                process_data[key.strip()] = value.strip()
-            
-
-        self.processes.append(process_data)  # Append to the list of processes
-    
-
-    def parse_thread(self, lines):
-        thread_data = {}
-        thread_calls = []
-        current_thread_id = None
-
-        #Start parsing thread section for id, os_id, and api calls
-
-        for line in lines:
-            if line.startswith("\tid ="):
-                    current_thread_id = int(line.split("=")[1].strip().strip('"'))
-                    thread_data["id"] = current_thread_id
-
-            elif line.startswith("\tos_tid ="):
-                    thread_data["os_tid"] = line.split("=")[1].strip()
-
-            elif current_thread_id is not None and line.startswith("\t["):
-                #Check if line contains timestamp bracket 
-            
-            
-                    thread_calls.append(line.strip())
-
-                      # Append call_data to the list
-                
-
-        # Assign the call_data dictionary with the thread_calls list?
-        thread_data["calls"] = thread_calls 
-        
-        # Append thread_data to the list of threads
-        self.threads.append(thread_data) 
-        return thread_data
-        
-    def write_json_file(self):
-                
-        self.data["process"] = self.processes  # Add the list of processes to the main dictionary
-        self.data["threads"] = self.threads  # Add the list of threads to the main dictionary
-        with open(self.output_filename, 'w') as file:
-                    json.dump(self.data, file, indent=4)
-
-    def convert(self):
-        lines = self.read_vmray_log()
-        self.parse_info(lines)
-
-        self.current_process = None  # Set current_process to None at the start of convert
-        current_section = None
-        current_section_lines = []
-        for line in lines:
-            if line.startswith("Process:"):
-                current_section = "process"
-                # Parse the process data immediately
-                self.parse_process(current_section_lines)  # Parse process data when encountering "Process"
-                current_section_lines = [line]
-            elif line.startswith("Thread:"):
-                current_section = "thread"
-                if current_section_lines:
-                    self.parse_thread(current_section_lines)  # Parse thread when encountering "Thread"
-                current_section_lines = [line]
-            else:
-                current_section_lines.append(line)
-
-        if current_section_lines:
-            if current_section == "process":
-                self.parse_process(current_section_lines)
-            elif current_section == "thread":
-                self.parse_thread(current_section_lines)
-        self.write_json_file()
-        print(json.dumps(self.data, indent=4)) 
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Convert VMray log files to JSON.")
-    parser.add_argument("input_file", help="The path to the VMray log file")
-    parser.add_argument("-o", "--output_file", default="vmray_output.json", help="The path to the output JSON file")
-
-    args = parser.parse_args()
-
-    vmray_parser = VMrayParser(args.input_file, args.output_file)
-    vmray_parser.convert()
-    print(f"Your VMray flog file '{args.input_file}' was converted to JSON and saved to '{args.output_file}'.")
\ No newline at end of file

From 0c9d3d09af583478554f2ddf1c7a9c56e07f99a0 Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 19 Jun 2024 15:12:52 +0000
Subject: [PATCH 028/105] fix ruff

---
 capa/features/extractors/vmray/call.py      | 2 +-
 capa/features/extractors/vmray/extractor.py | 3 ++-
 capa/features/extractors/vmray/models.py    | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index 8a2475642..1a3bb9e60 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -5,7 +5,7 @@
 from capa.features.insn import API, Number
 from capa.features.common import String, Feature
 from capa.features.address import Address
-from capa.features.extractors.vmray.models import Analysis, FunctionCall
+from capa.features.extractors.vmray.models import Analysis
 from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
 
 logger = logging.getLogger(__name__)
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 791ea89e9..73f0757d9 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -99,7 +99,7 @@ def from_zipfile(cls, zipfile_path: Path):
             flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected")
             flog = Analysis.from_xml(flog_xml)
 
-            # debug(flog)
+            debug(flog.processes[1])
             pprint(flog.processes[0])
 
         return cls(VMRayAnalysis(sv2, flog))
@@ -108,5 +108,6 @@ def from_zipfile(cls, zipfile_path: Path):
 if __name__ == "__main__":
     # TODO(mr): for testing, removeme
     import sys
+
     input_path = Path(sys.argv[1])
     VMRayExtractor.from_zipfile(input_path)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 60398fa21..dc6681517 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -6,9 +6,9 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 
-from typing import Any, Dict, List, Union, Literal, Optional
+from typing import Dict, List, Optional
 
-from pydantic import Field, BaseModel
+from pydantic import BaseModel
 from pydantic_xml import BaseXmlModel, attr, element
 
 

From 5be68d07519496bc8ca56f7f427a6f5fd45e9e0d Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 08:20:00 -0600
Subject: [PATCH 029/105] vmray: remove debug code and update call features
 entry point

---
 capa/features/extractors/vmray/call.py      | 15 ++++++++-------
 capa/features/extractors/vmray/extractor.py | 13 -------------
 2 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index 1a3bb9e60..e298c4ad4 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -11,7 +11,7 @@
 logger = logging.getLogger(__name__)
 
 
-def extract_function_calls(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
+def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
     """
     this method extracts the given call's features (such as API name and arguments),
     and returns them as API, Number, and String features.
@@ -50,9 +50,10 @@ def extract_function_calls(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle)
             assert_never(value)
 
 
-def extract_features(analysis: Analysis) -> Iterator[Tuple[Feature, Address]]:
-    """
-    Extract features from the Analysis object in models.py
-    """
-    for fncall in analysis.fncalls:
-        yield from extract_function_calls(fncall)
+def extract_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
+    for handler in CALL_HANDLERS:
+        for feature, addr in handler(ph, th, ch):
+            yield feature, addr
+
+
+CALL_HANDLERS = (extract_call_features,)
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 73f0757d9..19eb3b61e 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -11,8 +11,6 @@
 from pathlib import Path
 from zipfile import ZipFile
 
-from devtools import debug, pprint
-
 import capa.helpers
 import capa.features.extractors.vmray.file
 import capa.features.extractors.vmray.global_
@@ -99,15 +97,4 @@ def from_zipfile(cls, zipfile_path: Path):
             flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected")
             flog = Analysis.from_xml(flog_xml)
 
-            debug(flog.processes[1])
-            pprint(flog.processes[0])
-
         return cls(VMRayAnalysis(sv2, flog))
-
-
-if __name__ == "__main__":
-    # TODO(mr): for testing, removeme
-    import sys
-
-    input_path = Path(sys.argv[1])
-    VMRayExtractor.from_zipfile(input_path)

From ec21f3b3fc2802fa067c48813e764b56f19c596a Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 10:08:27 -0600
Subject: [PATCH 030/105] vmray: use xmltodict instead of pydantic_xml to
 improve performance

---
 capa/features/extractors/vmray/__init__.py  |   4 +-
 capa/features/extractors/vmray/call.py      |   1 -
 capa/features/extractors/vmray/extractor.py |   7 +-
 capa/features/extractors/vmray/models.py    | 104 +++++---------------
 requirements.txt                            |   2 +-
 5 files changed, 31 insertions(+), 87 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index a05282785..3d2bf9d0b 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -8,11 +8,11 @@
 from typing import Dict
 
 from capa.exceptions import UnsupportedFormatError
-from capa.features.extractors.vmray.models import File, Analysis, SummaryV2, StaticData
+from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData
 
 
 class VMRayAnalysis:
-    def __init__(self, sv2: SummaryV2, flog: Analysis):
+    def __init__(self, sv2: SummaryV2, flog: Flog):
         self.sv2 = sv2  # logs/summary_v2.json
         self.flog = flog  # logs/flog.xml
         self.exports: Dict[int, str] = {}
diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index e298c4ad4..c4d117d25 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -5,7 +5,6 @@
 from capa.features.insn import API, Number
 from capa.features.common import String, Feature
 from capa.features.address import Address
-from capa.features.extractors.vmray.models import Analysis
 from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
 
 logger = logging.getLogger(__name__)
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 19eb3b61e..094502005 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -11,13 +11,15 @@
 from pathlib import Path
 from zipfile import ZipFile
 
+import xmltodict
+
 import capa.helpers
 import capa.features.extractors.vmray.file
 import capa.features.extractors.vmray.global_
 from capa.features.common import Feature
 from capa.features.address import Address, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
-from capa.features.extractors.vmray.models import Process, Analysis, SummaryV2
+from capa.features.extractors.vmray.models import Flog, Process, SummaryV2
 from capa.features.extractors.base_extractor import (
     CallHandle,
     SampleHashes,
@@ -95,6 +97,7 @@ def from_zipfile(cls, zipfile_path: Path):
             sv2 = SummaryV2.model_validate(sv2_json)
 
             flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected")
-            flog = Analysis.from_xml(flog_xml)
+            flog_json = xmltodict.parse(flog_xml, attr_prefix="")
+            flog = Flog.model_validate(flog_json)
 
         return cls(VMRayAnalysis(sv2, flog))
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index dc6681517..5e0ddd48a 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -8,96 +8,38 @@
 
 from typing import Dict, List, Optional
 
-from pydantic import BaseModel
-from pydantic_xml import BaseXmlModel, attr, element
+from pydantic import Field, BaseModel
 
 
-### models for flog.xml
-class Param(BaseXmlModel, tag="param"):
-    name: str = attr()
-    type: str = attr()
-    value: Optional[str] = attr(default=None)
-
-
-# or see https://pydantic-xml.readthedocs.io/en/latest/pages/quickstart.html#wrapper
-class In(BaseXmlModel, tag="in"):
-    params: List[Param] = element(name="in")
-
-
-class Out(BaseXmlModel, tag="out"):
-    params: List[Param] = element(name="out")
-
-
-class FunctionCall(BaseXmlModel, tag="fncall"):
-    ts: int = attr()
-    fncall_id: int = attr()
-    process_id: int = attr()
-    thread_id: int = attr()
-    name: str = attr()  # API call name?
-    address: str = attr(name="addr")
-    from_: str = attr(name="from")
-    in_: Optional[In] = element(tag="in", default=None)
-    out_: Optional[Out] = element(tag="out", default=None)
-
-
-# note that not all fncalls always have an associated fnret, e.g. exit or WaitForSingleObject
-class FunctionReturn(BaseXmlModel, tag="fnret"):
-    ts: int = attr()
-    fncall_id: int = attr()
-    address: str = attr(name="addr")  # string that contains a hex value
-    from_: str = attr(name="from")  # string that contains a hex value
-
-
-# TODO check multiple are there
-class MonitorProcess(BaseXmlModel, tag="monitor_process"):
-    ts: int = attr()
-    process_id: int = attr()
-    image_name: str = attr()
-
-
-# TODO check multiple are there
-class MonitorThread(BaseXmlModel, tag="monitor_thread"):
-    ts: int = attr()
-    thread_id: int = attr()
-    process_id: int = attr()
-    os_tid: str = attr()  # TODO hex
-
-
-class NewRegion(BaseXmlModel, tag="new_region"):
-    ts: int = attr()
-    region_id: int = attr()
-    process_id: int = attr()
-    start_va: str = attr()
-    end_va: str = attr()
-    entry_point: str = attr()
-
-
-class RemoveRegion(BaseXmlModel, tag="remove_region"):
-    ts: int = attr()
-    region_id: int = attr()
+### models flog.xml files
+class FunctionCall(BaseModel):
+    ts: str
+    fncall_id: str
+    process_id: str
+    thread_id: str
+    name: str
+    addr: str
+    from_addr: str = Field(alias="from")
 
 
-# unordered is very slow, but elements may occur in any order
-class Analysis(BaseXmlModel, tag="analysis", search_mode="unordered"):
-    log_version: str = attr()
-    analyzer_version: str = attr()
-    analysis_date: str = attr()
+class FunctionReturn(BaseModel):
+    ts: str
+    fncall_id: str
+    addr: str
+    from_addr: str = Field(alias="from")
 
-    # super slow
-    # data: List[Union[MonitorProcess, MonitorThread, NewRegion, RemoveRegion, FunctionCall, FunctionReturn]]
 
-    # may want to preprocess file and remove/reorder entries for more efficient parsing
+class Analysis(BaseModel):
+    log_version: str
+    analyzer_version: str
+    analysis_date: str
 
-    processes: List[MonitorProcess] = element(tag="monitor_process")
-    threads: List[MonitorThread] = element(tag="monitor_thread")
+    function_calls: List[FunctionCall] = Field(alias="fncall", default=[])
+    function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])
 
-    # not important and slow down parsing
-    # new_regions: List[NewRegion] = element(tag="new_region")
-    # remove_regions: List[RemoveRegion] = element(tag="remove_region")
 
-    # very slow alternative; calls: List[Union[FunctionCall, FunctionReturn]]
-    fncalls: List[FunctionCall] = element(tag="fncall")
-    fnrets: List[FunctionReturn] = element(tag="fnret")
+class Flog(BaseModel):
+    analysis: Analysis
 
 
 ### models for summary_v2.json files
diff --git a/requirements.txt b/requirements.txt
index b2465502e..f0a604085 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,7 +28,7 @@ pyasn1-modules==0.2.8
 pycparser==2.22
 pydantic==2.7.3
 pydantic-core==2.18.4
-pydantic-xml==2.11.0
+xmltodict==0.13.0
 pyelftools==0.31
 pygments==2.18.0
 python-flirt==0.8.10

From 19502efff3f861bfe30ca8281b3989f814c7644d Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 13:05:32 -0600
Subject: [PATCH 031/105] vmray: connect process, thread, and call

---
 capa/features/extractors/vmray/__init__.py  | 24 ++++++++++-
 capa/features/extractors/vmray/call.py      |  4 ++
 capa/features/extractors/vmray/extractor.py | 27 +++++++------
 capa/features/extractors/vmray/file.py      |  6 +--
 scripts/show-features.py                    | 44 ++++++++++-----------
 5 files changed, 66 insertions(+), 39 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 3d2bf9d0b..8fbfa9156 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -5,10 +5,11 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-from typing import Dict
+from typing import Dict, List
+from collections import defaultdict
 
 from capa.exceptions import UnsupportedFormatError
-from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData
+from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall
 
 
 class VMRayAnalysis:
@@ -18,6 +19,8 @@ def __init__(self, sv2: SummaryV2, flog: Flog):
         self.exports: Dict[int, str] = {}
         self.imports: Dict[int, str] = {}
         self.sections: Dict[int, str] = {}
+        self.process_threads: Dict[int, List[int]] = defaultdict(list)
+        self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
         self.base_address: int
 
         self.sample_file_name: str
@@ -28,6 +31,8 @@ def __init__(self, sv2: SummaryV2, flog: Flog):
         self._compute_base_address()
         self._compute_exports()
         self._compute_sections()
+        self._compute_process_threads()
+        self._compute_process_calls()
 
         if not self.sample_file_static_data.pe:
             raise UnsupportedFormatError("VMRay feature extractor only supports PE at this time")
@@ -61,3 +66,18 @@ def _compute_sections(self):
         if self.sample_file_static_data.pe:
             for section in self.sample_file_static_data.pe.sections:
                 self.sections[section.virtual_address] = section.name
+
+    def _compute_process_threads(self):
+        for function_call in self.flog.analysis.function_calls:
+            pid: int = int(function_call.process_id)
+            tid: int = int(function_call.thread_id)
+
+            if tid not in self.process_threads[pid]:
+                self.process_threads[pid].append(tid)
+
+    def _compute_process_calls(self):
+        for function_call in self.flog.analysis.function_calls:
+            pid: int = int(function_call.process_id)
+            tid: int = int(function_call.thread_id)
+
+            self.process_calls[pid][tid].append(function_call)
diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index c4d117d25..c5f8d4469 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -21,6 +21,10 @@ def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -
       yields: Feature, address; where Feature is either: API, Number, or String.
     """
 
+    # TODO update for new models
+    # print(ch)
+    return
+
     # Extract API name
     yield API(ch.inner.name), ch.inner.address
 
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 094502005..16607ac47 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -14,10 +14,11 @@
 import xmltodict
 
 import capa.helpers
+import capa.features.extractors.vmray.call
 import capa.features.extractors.vmray.file
 import capa.features.extractors.vmray.global_
-from capa.features.common import Feature
-from capa.features.address import Address, AbsoluteVirtualAddress
+from capa.features.common import Feature, Characteristic
+from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
 from capa.features.extractors.vmray.models import Flog, Process, SummaryV2
 from capa.features.extractors.base_extractor import (
@@ -28,8 +29,6 @@
     DynamicFeatureExtractor,
 )
 
-# TODO also/or look into xmltodict?
-
 
 class VMRayExtractor(DynamicFeatureExtractor):
     def __init__(self, analysis: VMRayAnalysis):
@@ -68,23 +67,27 @@ def get_process_name(self, ph) -> str:
         return process.image_name
 
     def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
-        # TODO (meh)
-        yield from []
+        for thread in self.analysis.process_threads[ph.address.pid]:
+            address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread)
+            yield ThreadHandle(address=address, inner={})
 
     def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
-        # force this routine to be a generator,
-        # but we don't actually have any elements to generate.
-        yield from []
+        if False:
+            # force this routine to be a generator,
+            # but we don't actually have any elements to generate.
+            yield Characteristic("never"), NO_ADDRESS
+        return
 
     def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
-        # TODO (meh)
-        yield from []
+        for function_call in self.analysis.process_calls[ph.address.pid][th.address.tid]:
+            addr = DynamicCallAddress(thread=th.address, id=int(function_call.fncall_id))
+            yield CallHandle(address=addr, inner=function_call)
 
     def extract_call_features(
         self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
     ) -> Iterator[Tuple[Feature, Address]]:
         # TODO (meh)
-        yield from []
+        yield from capa.features.extractors.vmray.call.extract_features(ph, th, ch)
 
     def get_call_name(self, ph, th, ch) -> str:
         # TODO (meh)
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 2c6463c10..8344f394d 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -22,10 +22,10 @@ def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
     processes: Dict[str, Process] = analysis.sv2.processes
 
     for _, process in processes.items():
-        pid = process.os_pid
-        ppid = processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0
+        pid = process.monitor_id
+        ppid = processes[process.ref_parent_process.path[1]].monitor_id if process.ref_parent_process else 0
 
-        addr = ProcessAddress(pid=pid, ppid=ppid)
+        addr = ProcessAddress(pid=int(pid), ppid=int(ppid))
         yield ProcessHandle(address=addr, inner=process)
 
 
diff --git a/scripts/show-features.py b/scripts/show-features.py
index 30ad2a4be..b46432db8 100644
--- a/scripts/show-features.py
+++ b/scripts/show-features.py
@@ -235,34 +235,34 @@ def print_dynamic_features(processes, extractor: DynamicFeatureExtractor):
 
             print(f" proc: {extractor.get_process_name(p)}: {feature}")
 
-            for t in extractor.get_threads(p):
-                print(f"  thread: {t.address.tid}")
-                for feature, addr in extractor.extract_thread_features(p, t):
-                    if is_global_feature(feature):
-                        continue
+        for t in extractor.get_threads(p):
+            print(f"  thread: {t.address.tid}")
+            for feature, addr in extractor.extract_thread_features(p, t):
+                if is_global_feature(feature):
+                    continue
 
-                    if feature != Feature(0):
-                        print(f"   {format_address(addr)}: {feature}")
+                if feature != Feature(0):
+                    print(f"   {format_address(addr)}: {feature}")
 
-                for call in extractor.get_calls(p, t):
-                    apis = []
-                    arguments = []
-                    for feature, addr in extractor.extract_call_features(p, t, call):
-                        if is_global_feature(feature):
-                            continue
+            for call in extractor.get_calls(p, t):
+                apis = []
+                arguments = []
+                for feature, addr in extractor.extract_call_features(p, t, call):
+                    if is_global_feature(feature):
+                        continue
 
-                        if isinstance(feature, API):
-                            assert isinstance(addr, capa.features.address.DynamicCallAddress)
-                            apis.append((addr.id, str(feature.value)))
+                    if isinstance(feature, API):
+                        assert isinstance(addr, capa.features.address.DynamicCallAddress)
+                        apis.append((addr.id, str(feature.value)))
 
-                        if isinstance(feature, (Number, String)):
-                            arguments.append(str(feature.value))
+                    if isinstance(feature, (Number, String)):
+                        arguments.append(str(feature.value))
 
-                    if not apis:
-                        print(f"    arguments=[{', '.join(arguments)}]")
+                # if not apis:
+                #    print(f"    arguments=[{', '.join(arguments)}]")
 
-                    for cid, api in apis:
-                        print(f"    call {cid}: {api}({', '.join(arguments)})")
+                for cid, api in apis:
+                    print(f"    call {cid}: {api}({', '.join(arguments)})")
 
 
 def ida_main():

From 9ef705a9ac75a6cbb2003f8cd0f2a4b39136c386 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 14:04:31 -0600
Subject: [PATCH 032/105] vmray: remove old comments

---
 capa/features/extractors/vmray/extractor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 16607ac47..5ddb3714f 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -86,7 +86,6 @@ def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]
     def extract_call_features(
         self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
     ) -> Iterator[Tuple[Feature, Address]]:
-        # TODO (meh)
         yield from capa.features.extractors.vmray.call.extract_features(ph, th, ch)
 
     def get_call_name(self, ph, th, ch) -> str:

From 544899a04e608486186944313498b25b7e4af7c7 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 14:06:04 -0600
Subject: [PATCH 033/105] vmray: add os v. monitor id comment

---
 capa/features/extractors/vmray/file.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 8344f394d..2b26f7d60 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -22,6 +22,7 @@ def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
     processes: Dict[str, Process] = analysis.sv2.processes
 
     for _, process in processes.items():
+        # TODO (meh) should we use the OS process ID or vmray-assigned ID?
         pid = process.monitor_id
         ppid = processes[process.ref_parent_process.path[1]].monitor_id if process.ref_parent_process else 0
 

From 4b08e6275085425acf94a65c94fe2aa3d987d40a Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 14:12:34 -0600
Subject: [PATCH 034/105] vmray: fix flake8 lints

---
 capa/features/extractors/vmray/__init__.py  | 2 +-
 capa/features/extractors/vmray/call.py      | 9 ++++++++-
 capa/features/extractors/vmray/extractor.py | 4 ++--
 capa/features/extractors/vmray/file.py      | 4 ++--
 capa/features/extractors/vmray/models.py    | 4 ++--
 scripts/show-features.py                    | 2 +-
 6 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 8fbfa9156..35c9c0fc9 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -59,7 +59,7 @@ def _compute_exports(self):
                 self.exports[export.address] = export.api.name
 
     def _compute_imports(self):
-        # TODO (meh)
+        # TODO (meh): https://github.com/mandiant/capa/issues/2148
         ...
 
     def _compute_sections(self):
diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index c5f8d4469..a653e6602 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -1,3 +1,10 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
 import logging
 from typing import Tuple, Iterator
 
@@ -21,7 +28,7 @@ def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -
       yields: Feature, address; where Feature is either: API, Number, or String.
     """
 
-    # TODO update for new models
+    # TODO (meh): update for new models https://github.com/mandiant/capa/issues/2148
     # print(ch)
     return
 
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 5ddb3714f..236ed9451 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -59,7 +59,7 @@ def get_processes(self) -> Iterator[ProcessHandle]:
         yield from capa.features.extractors.vmray.file.get_processes(self.analysis)
 
     def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
-        # TODO (meh)
+        # TODO (meh): https://github.com/mandiant/capa/issues/2148
         yield from []
 
     def get_process_name(self, ph) -> str:
@@ -89,7 +89,7 @@ def extract_call_features(
         yield from capa.features.extractors.vmray.call.extract_features(ph, th, ch)
 
     def get_call_name(self, ph, th, ch) -> str:
-        # TODO (meh)
+        # TODO (meh): https://github.com/mandiant/capa/issues/2148
         raise NotImplementedError()
 
     @classmethod
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 2b26f7d60..45b0d0afa 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -22,7 +22,7 @@ def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
     processes: Dict[str, Process] = analysis.sv2.processes
 
     for _, process in processes.items():
-        # TODO (meh) should we use the OS process ID or vmray-assigned ID?
+        # TODO (meh): should we use the OS process ID or vmray-assigned ID? https://github.com/mandiant/capa/issues/2148
         pid = process.monitor_id
         ppid = processes[process.ref_parent_process.path[1]].monitor_id if process.ref_parent_process else 0
 
@@ -36,7 +36,7 @@ def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Add
 
 
 def extract_import_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    # TODO (meh)
+    # TODO (meh): https://github.com/mandiant/capa/issues/2148
     yield from []
 
 
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 5e0ddd48a..22db5a1d6 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -11,7 +11,7 @@
 from pydantic import Field, BaseModel
 
 
-### models flog.xml files
+# models flog.xml files
 class FunctionCall(BaseModel):
     ts: str
     fncall_id: str
@@ -42,7 +42,7 @@ class Flog(BaseModel):
     analysis: Analysis
 
 
-### models for summary_v2.json files
+# models for summary_v2.json files
 class GenericReference(BaseModel):
     path: List[str]
     source: str
diff --git a/scripts/show-features.py b/scripts/show-features.py
index b46432db8..6cfb100ed 100644
--- a/scripts/show-features.py
+++ b/scripts/show-features.py
@@ -229,7 +229,7 @@ def print_dynamic_features(processes, extractor: DynamicFeatureExtractor):
     for p in processes:
         print(f"proc: {extractor.get_process_name(p)} (ppid={p.address.ppid}, pid={p.address.pid})")
 
-        for feature, addr in extractor.extract_process_features(p):
+        for feature, _ in extractor.extract_process_features(p):
             if is_global_feature(feature):
                 continue
 

From 29fa3153b1417e9db89c0dbf32e8e166e422ccae Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 14:17:42 -0600
Subject: [PATCH 035/105] vmray: fix deptry lints

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 178d8cbc4..e159fac29 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,7 +79,7 @@ dependencies = [
     "rich>=13",
     "humanize>=4",
     "protobuf>=5",
-    "pydantic_xml[lxml]>=2.11",  # TODO benchmark lxml vs. elementtree - first impression eltree faster
+    "xmltodict>=0.13.0",
 
     # ---------------------------------------
     # Dependencies that we develop

From 9df611ff139a2e1507369d1ee30bd36cf7410733 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 14:41:50 -0600
Subject: [PATCH 036/105] vmray: add comments

---
 capa/features/extractors/vmray/__init__.py  | 4 +++-
 capa/features/extractors/vmray/extractor.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 35c9c0fc9..5e7c3fa93 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -40,7 +40,7 @@ def __init__(self, sv2: SummaryV2, flog: Flog):
     def _find_sample_file(self):
         for file_name, file_analysis in self.sv2.files.items():
             if file_analysis.is_sample:
-                # this indicates the sample submitted for analysis??
+                # target the sample submitted for analysis
                 self.sample_file_name = file_name
                 self.sample_file_analysis = file_analysis
 
@@ -68,6 +68,8 @@ def _compute_sections(self):
                 self.sections[section.virtual_address] = section.name
 
     def _compute_process_threads(self):
+        # logs/flog.xml appears to be the only file that contains thread-related
+        # so we use it here to map processes to threads
         for function_call in self.flog.analysis.function_calls:
             pid: int = int(function_call.process_id)
             tid: int = int(function_call.thread_id)
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 236ed9451..870e839d9 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -95,6 +95,7 @@ def get_call_name(self, ph, th, ch) -> str:
     @classmethod
     def from_zipfile(cls, zipfile_path: Path):
         with ZipFile(zipfile_path, "r") as zipfile:
+            # TODO (meh): is default password "infected" good enough?? https://github.com/mandiant/capa/issues/2148
             sv2_json = json.loads(zipfile.read("logs/summary_v2.json", pwd=b"infected"))
             sv2 = SummaryV2.model_validate(sv2_json)
 

From ec6c9c93bdd5c50e836dcaa79e3eb18233c9330c Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 14:42:42 -0600
Subject: [PATCH 037/105] vmray: remove unused fields from summary_v2 pydantic
 models

---
 capa/features/extractors/vmray/models.py | 94 ++++++++++++------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 22db5a1d6..f220e7906 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -52,16 +52,16 @@ class StaticDataReference(GenericReference): ...
 
 
 class PEFileBasicInfo(BaseModel):
-    compile_time: str
+    # compile_time: str
     file_type: str
     image_base: int
     machine_type: str
-    size_of_code: int
-    size_of_initialized_data: int
-    size_of_uninitialized_data: int
-    subsystem: str
-    entry_point: int
-    imphash: Optional[str] = None
+    # size_of_code: int
+    # size_of_initialized_data: int
+    # size_of_uninitialized_data: int
+    # subsystem: str
+    # entry_point: int
+    # imphash: Optional[str] = None
 
 
 class API(BaseModel):
@@ -77,9 +77,9 @@ class PEFileExport(BaseModel):
 class PEFileImport(BaseModel):
     address: int
     api: API
-    thunk_offset: int
-    hint: Optional[int] = None
-    thunk_rva: int
+    # thunk_offset: int
+    # hint: Optional[int] = None
+    # thunk_rva: int
 
 
 class PEFileImportModule(BaseModel):
@@ -88,13 +88,13 @@ class PEFileImportModule(BaseModel):
 
 
 class PEFileSection(BaseModel):
-    entropy: float
-    flags: List[str] = []
+    # entropy: float
+    # flags: List[str] = []
     name: str
-    raw_data_offset: int
-    raw_data_size: int
+    # raw_data_offset: int
+    # raw_data_size: int
     virtual_address: int
-    virtual_size: int
+    # virtual_size: int
 
 
 class PEFile(BaseModel):
@@ -112,32 +112,32 @@ class FileHashes(BaseModel):
     md5: str
     sha1: str
     sha256: str
-    ssdeep: str
+    # ssdeep: str
 
 
 class File(BaseModel):
-    categories: List[str]
+    # categories: List[str]
     hash_values: FileHashes
-    is_artifact: bool
-    is_ioc: bool
+    # is_artifact: bool
+    # is_ioc: bool
     is_sample: bool
-    size: int
-    is_truncated: bool
-    mime_type: Optional[str] = None
-    operations: List[str] = []
-    ref_filenames: List[GenericReference] = []
-    ref_gfncalls: List[GenericReference] = []
+    # size: int
+    # is_truncated: bool
+    # mime_type: Optional[str] = None
+    # operations: List[str] = []
+    # ref_filenames: List[GenericReference] = []
+    # ref_gfncalls: List[GenericReference] = []
     ref_static_data: Optional[StaticDataReference] = None
-    ref_vti_matches: List[GenericReference] = []
-    verdict: str
+    # ref_vti_matches: List[GenericReference] = []
+    # verdict: str
 
 
 class Process(BaseModel):
-    bitness: int
-    is_artifact: bool
-    is_ioc: bool
+    # bitness: int
+    # is_artifact: bool
+    # is_ioc: bool
     monitor_id: int
-    monitor_reason: str
+    # monitor_reason: str
     os_pid: int
     filename: str
     image_name: str
@@ -146,38 +146,38 @@ class Process(BaseModel):
 
 class Filename(BaseModel):
     filename: str
-    is_artifact: bool
-    is_ioc: bool
-    verdict: str
+    # is_artifact: bool
+    # is_ioc: bool
+    # verdict: str
 
 
 class Mutex(BaseModel):
     name: str
-    is_artifact: bool
-    is_ioc: bool
-    verdict: str
+    # is_artifact: bool
+    # is_ioc: bool
+    # verdict: str
 
 
 class Registry(BaseModel):
     reg_key_name: str
-    reg_key_value_type: Optional[str] = None
-    is_artifact: bool
-    is_ioc: bool
-    verdict: str
+    # reg_key_value_type: Optional[str] = None
+    # is_artifact: bool
+    # is_ioc: bool
+    # verdict: str
 
 
 class Domain(BaseModel):
     domain: str
-    is_artifact: bool
-    is_ioc: bool
-    verdict: str
+    # is_artifact: bool
+    # is_ioc: bool
+    # verdict: str
 
 
 class IPAddress(BaseModel):
     ip_address: str
-    is_artifact: bool
-    is_ioc: bool
-    verdict: str
+    # is_artifact: bool
+    # is_ioc: bool
+    # verdict: str
 
 
 class AnalysisMetadata(BaseModel):

From 9be35f9a8dc05182b41fe779b5ba9bef95439e9f Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 20 Jun 2024 15:19:55 -0600
Subject: [PATCH 038/105] vmray: remove unneeded unpacking

---
 capa/features/extractors/vmray/file.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 45b0d0afa..16db80dbb 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -21,7 +21,7 @@
 def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
     processes: Dict[str, Process] = analysis.sv2.processes
 
-    for _, process in processes.items():
+    for process in processes.values():
         # TODO (meh): should we use the OS process ID or vmray-assigned ID? https://github.com/mandiant/capa/issues/2148
         pid = process.monitor_id
         ppid = processes[process.ref_parent_process.path[1]].monitor_id if process.ref_parent_process else 0
@@ -46,27 +46,27 @@ def extract_section_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Ad
 
 
 def extract_referenced_filenames(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    for _, filename in analysis.sv2.filenames.items():
+    for filename in analysis.sv2.filenames.values():
         yield String(filename.filename), NO_ADDRESS
 
 
 def extract_referenced_mutex_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    for _, mutex in analysis.sv2.mutexes.items():
+    for mutex in analysis.sv2.mutexes.values():
         yield String(mutex.name), NO_ADDRESS
 
 
 def extract_referenced_domain_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    for _, domain in analysis.sv2.domains.items():
+    for domain in analysis.sv2.domains.values():
         yield String(domain.domain), NO_ADDRESS
 
 
 def extract_referenced_ip_addresses(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    for _, ip_address in analysis.sv2.ip_addresses.items():
+    for ip_address in analysis.sv2.ip_addresses.values():
         yield String(ip_address.ip_address), NO_ADDRESS
 
 
 def extract_referenced_registry_key_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    for _, registry_record in analysis.sv2.registry_records.items():
+    for registry_record in analysis.sv2.registry_records.values():
         yield String(registry_record.reg_key_name), NO_ADDRESS
 
 

From 81581fe85e9a1ebc983b24bb1f3e0243ce611392 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 10:15:28 -0600
Subject: [PATCH 039/105] vmray: emit string file features

---
 capa/features/extractors/vmray/__init__.py  | 31 +++++++++++++++++++--
 capa/features/extractors/vmray/extractor.py | 18 ++----------
 capa/features/extractors/vmray/file.py      |  7 ++++-
 3 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 5e7c3fa93..36e396a9d 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -5,17 +5,35 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+import json
+import logging
 from typing import Dict, List
+from pathlib import Path
+from zipfile import ZipFile
 from collections import defaultdict
 
+import xmltodict
+
 from capa.exceptions import UnsupportedFormatError
 from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall
 
+logger = logging.getLogger(__name__)
+
+# TODO (meh): is default password "infected" good enough?? https://github.com/mandiant/capa/issues/2148
+DEFAULT_ARCHIVE_PASSWORD = b"infected"
+
 
 class VMRayAnalysis:
-    def __init__(self, sv2: SummaryV2, flog: Flog):
-        self.sv2 = sv2  # logs/summary_v2.json
-        self.flog = flog  # logs/flog.xml
+    def __init__(self, zipfile_path: Path):
+        self.zipfile = ZipFile(zipfile_path, "r")
+
+        sv2_json = json.loads(self.zipfile.read("logs/summary_v2.json", pwd=DEFAULT_ARCHIVE_PASSWORD))
+        self.sv2 = SummaryV2.model_validate(sv2_json)
+
+        flog_xml = self.zipfile.read("logs/flog.xml", pwd=DEFAULT_ARCHIVE_PASSWORD)
+        flog_json = xmltodict.parse(flog_xml, attr_prefix="")
+        self.flog = Flog.model_validate(flog_json)
+
         self.exports: Dict[int, str] = {}
         self.imports: Dict[int, str] = {}
         self.sections: Dict[int, str] = {}
@@ -37,6 +55,13 @@ def __init__(self, sv2: SummaryV2, flog: Flog):
         if not self.sample_file_static_data.pe:
             raise UnsupportedFormatError("VMRay feature extractor only supports PE at this time")
 
+        sample_sha256: str = self.sample_file_analysis.hash_values.sha256.lower()
+        sample_file_path: str = f"internal/static_analyses/{sample_sha256}/objects/files/{sample_sha256}"
+
+        logger.debug("sample file path: %s", sample_file_path)
+
+        self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)
+
     def _find_sample_file(self):
         for file_name, file_analysis in self.sv2.files.items():
             if file_analysis.is_sample:
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 870e839d9..a8bdf6875 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -6,12 +6,9 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 
-import json
+
 from typing import Tuple, Iterator
 from pathlib import Path
-from zipfile import ZipFile
-
-import xmltodict
 
 import capa.helpers
 import capa.features.extractors.vmray.call
@@ -20,7 +17,7 @@
 from capa.features.common import Feature, Characteristic
 from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
-from capa.features.extractors.vmray.models import Flog, Process, SummaryV2
+from capa.features.extractors.vmray.models import Process
 from capa.features.extractors.base_extractor import (
     CallHandle,
     SampleHashes,
@@ -94,13 +91,4 @@ def get_call_name(self, ph, th, ch) -> str:
 
     @classmethod
     def from_zipfile(cls, zipfile_path: Path):
-        with ZipFile(zipfile_path, "r") as zipfile:
-            # TODO (meh): is default password "infected" good enough?? https://github.com/mandiant/capa/issues/2148
-            sv2_json = json.loads(zipfile.read("logs/summary_v2.json", pwd=b"infected"))
-            sv2 = SummaryV2.model_validate(sv2_json)
-
-            flog_xml = zipfile.read("logs/flog.xml", pwd=b"infected")
-            flog_json = xmltodict.parse(flog_xml, attr_prefix="")
-            flog = Flog.model_validate(flog_json)
-
-        return cls(VMRayAnalysis(sv2, flog))
+        return cls(VMRayAnalysis(zipfile_path))
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 16db80dbb..dd9c76b95 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -8,6 +8,7 @@
 import logging
 from typing import Dict, Tuple, Iterator
 
+import capa.features.extractors.common
 from capa.features.file import Export, Section
 from capa.features.common import String, Feature
 from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
@@ -70,6 +71,10 @@ def extract_referenced_registry_key_names(analysis: VMRayAnalysis) -> Iterator[T
         yield String(registry_record.reg_key_name), NO_ADDRESS
 
 
+def extract_file_strings(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    yield from capa.features.extractors.common.extract_file_strings(analysis.sample_file_buf)
+
+
 def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
     for handler in FILE_HANDLERS:
         for feature, addr in handler(analysis):
@@ -85,5 +90,5 @@ def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address
     extract_referenced_domain_names,
     extract_referenced_ip_addresses,
     extract_referenced_registry_key_names,
-    # extract_file_strings,
+    extract_file_strings,
 )

From aad4854a617653e4388b9ad52bfa885b8085a9f4 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 11:33:13 -0600
Subject: [PATCH 040/105] vmray: use process OS PID instead of monitor ID

---
 capa/features/extractors/vmray/__init__.py  | 26 +++++++++++++++++----
 capa/features/extractors/vmray/extractor.py |  2 +-
 capa/features/extractors/vmray/file.py      | 13 +++++++----
 capa/features/extractors/vmray/models.py    | 24 ++++++++++++++-----
 4 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 36e396a9d..153f41af5 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -37,6 +37,7 @@ def __init__(self, zipfile_path: Path):
         self.exports: Dict[int, str] = {}
         self.imports: Dict[int, str] = {}
         self.sections: Dict[int, str] = {}
+        self.process_ids: Dict[int, int] = {}
         self.process_threads: Dict[int, List[int]] = defaultdict(list)
         self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
         self.base_address: int
@@ -49,6 +50,7 @@ def __init__(self, zipfile_path: Path):
         self._compute_base_address()
         self._compute_exports()
         self._compute_sections()
+        self._compute_process_ids()
         self._compute_process_threads()
         self._compute_process_calls()
 
@@ -92,19 +94,35 @@ def _compute_sections(self):
             for section in self.sample_file_static_data.pe.sections:
                 self.sections[section.virtual_address] = section.name
 
+    def _compute_process_ids(self):
+        for process in self.sv2.processes.values():
+            assert process.monitor_id not in self.process_ids.keys()
+            assert process.os_pid not in self.process_ids.values()
+
+            self.process_ids[process.monitor_id] = process.os_pid
+
     def _compute_process_threads(self):
         # logs/flog.xml appears to be the only file that contains thread-related
         # so we use it here to map processes to threads
         for function_call in self.flog.analysis.function_calls:
-            pid: int = int(function_call.process_id)
-            tid: int = int(function_call.thread_id)
+            pid: int = self.get_process_os_pid(function_call.process_id)  # flog.xml uses process monitor ID, not OS PID
+            tid: int = function_call.thread_id
+
+            assert isinstance(pid, int)
+            assert isinstance(tid, int)
 
             if tid not in self.process_threads[pid]:
                 self.process_threads[pid].append(tid)
 
     def _compute_process_calls(self):
         for function_call in self.flog.analysis.function_calls:
-            pid: int = int(function_call.process_id)
-            tid: int = int(function_call.thread_id)
+            pid: int = self.get_process_os_pid(function_call.process_id)  # flog.xml uses process monitor ID, not OS PID
+            tid: int = function_call.thread_id
+
+            assert isinstance(pid, int)
+            assert isinstance(tid, int)
 
             self.process_calls[pid][tid].append(function_call)
+
+    def get_process_os_pid(self, monitor_id: int) -> int:
+        return self.process_ids[monitor_id]
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index a8bdf6875..e8270be52 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -77,7 +77,7 @@ def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterat
 
     def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
         for function_call in self.analysis.process_calls[ph.address.pid][th.address.tid]:
-            addr = DynamicCallAddress(thread=th.address, id=int(function_call.fncall_id))
+            addr = DynamicCallAddress(thread=th.address, id=function_call.fncall_id)
             yield CallHandle(address=addr, inner=function_call)
 
     def extract_call_features(
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index dd9c76b95..a91dd40aa 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -23,11 +23,14 @@ def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
     processes: Dict[str, Process] = analysis.sv2.processes
 
     for process in processes.values():
-        # TODO (meh): should we use the OS process ID or vmray-assigned ID? https://github.com/mandiant/capa/issues/2148
-        pid = process.monitor_id
-        ppid = processes[process.ref_parent_process.path[1]].monitor_id if process.ref_parent_process else 0
-
-        addr = ProcessAddress(pid=int(pid), ppid=int(ppid))
+        pid: int = analysis.get_process_os_pid(process.monitor_id)
+        ppid: int = (
+            analysis.get_process_os_pid(processes[process.ref_parent_process.path[1]].monitor_id)
+            if process.ref_parent_process
+            else 0
+        )
+
+        addr: ProcessAddress = ProcessAddress(pid=pid, ppid=ppid)
         yield ProcessHandle(address=addr, inner=process)
 
 
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index f220e7906..d413e9d7f 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -9,22 +9,34 @@
 from typing import Dict, List, Optional
 
 from pydantic import Field, BaseModel
+from typing_extensions import Annotated
+from pydantic.functional_validators import BeforeValidator
+
+
+def validate_hex_int(value):
+    if isinstance(value, str):
+        return int(value, 16) if value.startswith("0x") else int(value, 10)
+    else:
+        return value
+
+
+HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
 
 # models flog.xml files
 class FunctionCall(BaseModel):
-    ts: str
-    fncall_id: str
-    process_id: str
-    thread_id: str
+    ts: HexInt
+    fncall_id: HexInt
+    process_id: HexInt
+    thread_id: HexInt
     name: str
     addr: str
     from_addr: str = Field(alias="from")
 
 
 class FunctionReturn(BaseModel):
-    ts: str
-    fncall_id: str
+    ts: HexInt
+    fncall_id: HexInt
     addr: str
     from_addr: str = Field(alias="from")
 

From bcdaa80dfa6813d8ed4ab20d9d8fb7958316be81 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 13:34:30 -0600
Subject: [PATCH 041/105] vmray: emit file import features

---
 capa/features/extractors/vmray/__init__.py | 11 +++++++----
 capa/features/extractors/vmray/file.py     |  8 +++++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 153f41af5..0c3220944 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -7,7 +7,7 @@
 # See the License for the specific language governing permissions and limitations under the License.
 import json
 import logging
-from typing import Dict, List
+from typing import Dict, List, Tuple
 from pathlib import Path
 from zipfile import ZipFile
 from collections import defaultdict
@@ -35,7 +35,7 @@ def __init__(self, zipfile_path: Path):
         self.flog = Flog.model_validate(flog_json)
 
         self.exports: Dict[int, str] = {}
-        self.imports: Dict[int, str] = {}
+        self.imports: Dict[int, Tuple[str, str]] = {}
         self.sections: Dict[int, str] = {}
         self.process_ids: Dict[int, int] = {}
         self.process_threads: Dict[int, List[int]] = defaultdict(list)
@@ -48,6 +48,7 @@ def __init__(self, zipfile_path: Path):
 
         self._find_sample_file()
         self._compute_base_address()
+        self._compute_imports()
         self._compute_exports()
         self._compute_sections()
         self._compute_process_ids()
@@ -86,8 +87,10 @@ def _compute_exports(self):
                 self.exports[export.address] = export.api.name
 
     def _compute_imports(self):
-        # TODO (meh): https://github.com/mandiant/capa/issues/2148
-        ...
+        if self.sample_file_static_data.pe:
+            for module in self.sample_file_static_data.pe.imports:
+                for api in module.apis:
+                    self.imports[api.address] = (module.dll, api.api.name)
 
     def _compute_sections(self):
         if self.sample_file_static_data.pe:
diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index a91dd40aa..93feb22f2 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -9,10 +9,11 @@
 from typing import Dict, Tuple, Iterator
 
 import capa.features.extractors.common
-from capa.features.file import Export, Section
+from capa.features.file import Export, Import, Section
 from capa.features.common import String, Feature
 from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
+from capa.features.extractors.helpers import generate_symbols
 from capa.features.extractors.vmray.models import Process
 from capa.features.extractors.base_extractor import ProcessHandle
 
@@ -40,8 +41,9 @@ def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Add
 
 
 def extract_import_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    # TODO (meh): https://github.com/mandiant/capa/issues/2148
-    yield from []
+    for addr, name in analysis.imports.items():
+        for symbol in generate_symbols(name[0], name[1], include_dll=True):
+            yield Import(symbol), AbsoluteVirtualAddress(addr)
 
 
 def extract_section_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:

From da0545780beb92200226afba1e06ca253d148cac Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 16:25:56 -0600
Subject: [PATCH 042/105] vmray: emit number call features for input parameters

---
 capa/features/extractors/vmray/call.py      | 48 +++---------------
 capa/features/extractors/vmray/extractor.py |  6 +--
 capa/features/extractors/vmray/models.py    | 54 ++++++++++++++++++++-
 3 files changed, 63 insertions(+), 45 deletions(-)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index a653e6602..63bc6f487 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -8,56 +8,24 @@
 import logging
 from typing import Tuple, Iterator
 
-from capa.helpers import assert_never
 from capa.features.insn import API, Number
-from capa.features.common import String, Feature
+from capa.features.common import Feature
 from capa.features.address import Address
+from capa.features.extractors.vmray.models import PARAM_TYPE_PTR, FunctionCall
 from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
 
 logger = logging.getLogger(__name__)
 
 
 def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
-    """
-    this method extracts the given call's features (such as API name and arguments),
-    and returns them as API, Number, and String features.
+    call: FunctionCall = ch.inner
 
-    args:
-      call: FunctionCall object representing the XML fncall element
+    if call.params_in:
+        for param in call.params_in.params:
+            if param.type_ not in PARAM_TYPE_PTR:
+                yield Number(param.value), ch.address
 
-      yields: Feature, address; where Feature is either: API, Number, or String.
-    """
-
-    # TODO (meh): update for new models https://github.com/mandiant/capa/issues/2148
-    # print(ch)
-    return
-
-    # Extract API name
-    yield API(ch.inner.name), ch.inner.address
-
-    # Extract arguments from <in>
-    for param in ch.inner.in_:
-        value = param.value
-        if isinstance(value, str):
-            yield String(value), ch.inner.address
-
-        elif isinstance(value, int):
-            yield Number(value), ch.inner.address
-
-        else:
-            assert_never(value)
-
-    # Extract return value from <out>
-    if ch.inner.out is not None:
-        value = ch.inner.out.value
-        if isinstance(value, str):
-            yield String(value), ch.inner.address
-
-        elif isinstance(value, int):
-            yield Number(value), ch.inner.address
-
-        else:
-            assert_never(value)
+    yield API(call.name), ch.address
 
 
 def extract_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index e8270be52..e535cfe74 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -17,7 +17,7 @@
 from capa.features.common import Feature, Characteristic
 from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
-from capa.features.extractors.vmray.models import Process
+from capa.features.extractors.vmray.models import Process, FunctionCall
 from capa.features.extractors.base_extractor import (
     CallHandle,
     SampleHashes,
@@ -86,8 +86,8 @@ def extract_call_features(
         yield from capa.features.extractors.vmray.call.extract_features(ph, th, ch)
 
     def get_call_name(self, ph, th, ch) -> str:
-        # TODO (meh): https://github.com/mandiant/capa/issues/2148
-        raise NotImplementedError()
+        call: FunctionCall = ch.inner
+        return call.name
 
     @classmethod
     def from_zipfile(cls, zipfile_path: Path):
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index d413e9d7f..e496c0027 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -12,6 +12,33 @@
 from typing_extensions import Annotated
 from pydantic.functional_validators import BeforeValidator
 
+"""
+# possible param types, included for documentation
+PARAM_TYPE = (
+    "signed_8bit",
+    "unsigned_8bit",
+    "signed_16bit",
+    "unsigned_16bit",
+    "signed_32bit",
+    "unsigned_32bit",
+    "signed_64bit",
+    "unsigned_64bit",
+    "double",
+    "void_ptr",
+    "bool",
+    "unknown",
+    "ptr",
+    "void",
+    "str",
+    "array",
+    "container",
+    "bindata",
+    "undefined_type",
+)
+"""
+
+PARAM_TYPE_PTR = ("void_ptr", "ptr")
+
 
 def validate_hex_int(value):
     if isinstance(value, str):
@@ -20,9 +47,31 @@ def validate_hex_int(value):
         return value
 
 
+def validate_param_list(value):
+    if isinstance(value, list):
+        return value
+    else:
+        return [value]
+
+
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
 
+class Param(BaseModel):
+    name: str
+    type_: str = Field(alias="type")
+    value: HexInt
+
+
+# params may be stored as a list of Param or a single Param
+# so we ensure a list is used for single Param as well
+ParamList = Annotated[List[Param], BeforeValidator(validate_param_list)]
+
+
+class Params(BaseModel):
+    params: ParamList = Field(alias="param")
+
+
 # models flog.xml files
 class FunctionCall(BaseModel):
     ts: HexInt
@@ -30,8 +79,9 @@ class FunctionCall(BaseModel):
     process_id: HexInt
     thread_id: HexInt
     name: str
-    addr: str
-    from_addr: str = Field(alias="from")
+    addr: HexInt
+    from_addr: HexInt = Field(alias="from")
+    params_in: Params = Field(alias="in", default=None)
 
 
 class FunctionReturn(BaseModel):

From 5b7a0cad5fa7c4327ba05a5674a9d112f11d3eab Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 16:36:28 -0600
Subject: [PATCH 043/105] vmray: emit number call features for output
 parameters

---
 capa/features/extractors/vmray/call.py   | 7 ++++++-
 capa/features/extractors/vmray/models.py | 3 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index 63bc6f487..45fc73eae 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -22,7 +22,12 @@ def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -
 
     if call.params_in:
         for param in call.params_in.params:
-            if param.type_ not in PARAM_TYPE_PTR:
+            if param.type_ not in PARAM_TYPE_PTR and param.value is not None:
+                yield Number(param.value), ch.address
+
+    if call.params_out:
+        for param in call.params_out.params:
+            if param.type_ not in PARAM_TYPE_PTR and param.value is not None:
                 yield Number(param.value), ch.address
 
     yield API(call.name), ch.address
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index e496c0027..3c039f330 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -60,7 +60,7 @@ def validate_param_list(value):
 class Param(BaseModel):
     name: str
     type_: str = Field(alias="type")
-    value: HexInt
+    value: Optional[HexInt] = None
 
 
 # params may be stored as a list of Param or a single Param
@@ -82,6 +82,7 @@ class FunctionCall(BaseModel):
     addr: HexInt
     from_addr: HexInt = Field(alias="from")
     params_in: Params = Field(alias="in", default=None)
+    params_out: Params = Field(alias="out", default=None)
 
 
 class FunctionReturn(BaseModel):

From e2f5eb7d307f006536a54e9b989a49e4c671c972 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 16:43:48 -0600
Subject: [PATCH 044/105] vmray: clean up models

---
 capa/features/extractors/vmray/models.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 3c039f330..7fb9b8f7b 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -85,11 +85,14 @@ class FunctionCall(BaseModel):
     params_out: Params = Field(alias="out", default=None)
 
 
+"""
+# not useful for capa, but included for documentation in case
 class FunctionReturn(BaseModel):
     ts: HexInt
     fncall_id: HexInt
-    addr: str
-    from_addr: str = Field(alias="from")
+    addr: HexInt
+    from_addr: HexInt = Field(alias="from")
+"""
 
 
 class Analysis(BaseModel):
@@ -98,7 +101,7 @@ class Analysis(BaseModel):
     analysis_date: str
 
     function_calls: List[FunctionCall] = Field(alias="fncall", default=[])
-    function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])
+    # function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])
 
 
 class Flog(BaseModel):

From 4bbe9e1ce9fc14480160f3029b281fd4a271b966 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 18:35:50 -0600
Subject: [PATCH 045/105] vmray: emit number and string call features for
 pointer dereference

---
 capa/features/extractors/vmray/call.py   | 22 ++++++++++++-----
 capa/features/extractors/vmray/models.py | 30 +++++++++++++++++++++---
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index 45fc73eae..e0f1059af 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -9,26 +9,36 @@
 from typing import Tuple, Iterator
 
 from capa.features.insn import API, Number
-from capa.features.common import Feature
+from capa.features.common import String, Feature
 from capa.features.address import Address
-from capa.features.extractors.vmray.models import PARAM_TYPE_PTR, FunctionCall
+from capa.features.extractors.vmray.models import PARAM_TYPE_INT, PARAM_TYPE_STR, Param, FunctionCall, hexint
 from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
 
 logger = logging.getLogger(__name__)
 
 
+def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
+    if param.deref is not None:
+        if param.deref.value is not None:
+            if param.deref.type_ in PARAM_TYPE_INT:
+                yield Number(hexint(param.deref.value)), ch.address
+            elif param.deref.type_ in PARAM_TYPE_STR:
+                yield String(param.deref.value), ch.address
+    elif param.value is not None:
+        if param.type_ in PARAM_TYPE_INT:
+            yield Number(hexint(param.value)), ch.address
+
+
 def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
     call: FunctionCall = ch.inner
 
     if call.params_in:
         for param in call.params_in.params:
-            if param.type_ not in PARAM_TYPE_PTR and param.value is not None:
-                yield Number(param.value), ch.address
+            yield from get_call_param_features(param, ch)
 
     if call.params_out:
         for param in call.params_out.params:
-            if param.type_ not in PARAM_TYPE_PTR and param.value is not None:
-                yield Number(param.value), ch.address
+            yield from get_call_param_features(param, ch)
 
     yield API(call.name), ch.address
 
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 7fb9b8f7b..9bb46c135 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -6,7 +6,7 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 
-from typing import Dict, List, Optional
+from typing import Dict, List, Union, Optional
 
 from pydantic import Field, BaseModel
 from typing_extensions import Annotated
@@ -38,15 +38,33 @@
 """
 
 PARAM_TYPE_PTR = ("void_ptr", "ptr")
+PARAM_TYPE_STR = ("str",)
+PARAM_TYPE_INT = (
+    "signed_8bit",
+    "unsigned_8bit",
+    "signed_16bit",
+    "unsigned_16bit",
+    "signed_32bit",
+    "unsigned_32bit",
+    "signed_64bit",
+    "unsigned_64bit",
+    "double",
+    "bool",
+    "unknown",
+)
 
 
-def validate_hex_int(value):
+def hexint(value: Union[str, int]) -> int:
     if isinstance(value, str):
         return int(value, 16) if value.startswith("0x") else int(value, 10)
     else:
         return value
 
 
+def validate_hex_int(value: Union[str, int]) -> int:
+    return hexint(value)
+
+
 def validate_param_list(value):
     if isinstance(value, list):
         return value
@@ -57,10 +75,16 @@ def validate_param_list(value):
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
 
+class ParamDeref(BaseModel):
+    type_: str = Field(alias="type")
+    value: Optional[str] = None
+
+
 class Param(BaseModel):
     name: str
     type_: str = Field(alias="type")
-    value: Optional[HexInt] = None
+    value: Optional[str] = None
+    deref: Optional[ParamDeref] = None
 
 
 # params may be stored as a list of Param or a single Param

From 06631fc39dbd0683a597582017fdbdbd085027e3 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 18:42:42 -0600
Subject: [PATCH 046/105] vmray: remove call feature extraction for out
 parameters

---
 capa/features/extractors/vmray/call.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index e0f1059af..e20805bf6 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -36,9 +36,12 @@ def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -
         for param in call.params_in.params:
             yield from get_call_param_features(param, ch)
 
+    """
+    # TODO (meh): params_out stores return value, not sure where to emit this?? https://github.com/mandiant/capa/issues/2148
     if call.params_out:
         for param in call.params_out.params:
             yield from get_call_param_features(param, ch)
+    """
 
     yield API(call.name), ch.address
 

From 931a9b942154febb43a9a76d31778bc87a6816d1 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 18:44:29 -0600
Subject: [PATCH 047/105] vmray: clean up models

---
 capa/features/extractors/vmray/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 9bb46c135..73a217edc 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -106,7 +106,7 @@ class FunctionCall(BaseModel):
     addr: HexInt
     from_addr: HexInt = Field(alias="from")
     params_in: Params = Field(alias="in", default=None)
-    params_out: Params = Field(alias="out", default=None)
+    # params_out: Params = Field(alias="out", default=None)
 
 
 """

From 85632f698f39a0837fc87bb70a1cfea4c29c035a Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 18:45:53 -0600
Subject: [PATCH 048/105] vmray: clean up models

---
 capa/features/extractors/vmray/models.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 73a217edc..df576111d 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -98,13 +98,13 @@ class Params(BaseModel):
 
 # models flog.xml files
 class FunctionCall(BaseModel):
-    ts: HexInt
+    # ts: HexInt
     fncall_id: HexInt
     process_id: HexInt
     thread_id: HexInt
     name: str
-    addr: HexInt
-    from_addr: HexInt = Field(alias="from")
+    # addr: HexInt
+    # from_addr: HexInt = Field(alias="from")
     params_in: Params = Field(alias="in", default=None)
     # params_out: Params = Field(alias="out", default=None)
 
@@ -122,7 +122,7 @@ class FunctionReturn(BaseModel):
 class Analysis(BaseModel):
     log_version: str
     analyzer_version: str
-    analysis_date: str
+    # analysis_date: str
 
     function_calls: List[FunctionCall] = Field(alias="fncall", default=[])
     # function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])

From 253d70efac8da25dd5515de73cc8fe01728ff163 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 18:49:08 -0600
Subject: [PATCH 049/105] vmray: add comments

---
 capa/features/extractors/vmray/call.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index e20805bf6..0ee43b7d3 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -19,6 +19,9 @@
 
 def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
     if param.deref is not None:
+        # pointer types contain a special "deref" member that stores the deref'd value
+        # so we check for this first and ignore Param.value as this always contains the
+        # deref'd pointer value
         if param.deref.value is not None:
             if param.deref.type_ in PARAM_TYPE_INT:
                 yield Number(hexint(param.deref.value)), ch.address

From 307b0cc3273ef70053987f4e4096519bab9eb6d6 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 18:51:21 -0600
Subject: [PATCH 050/105] vmray: add comments

---
 capa/features/extractors/vmray/file.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index 93feb22f2..c84f0190a 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -24,6 +24,8 @@ def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
     processes: Dict[str, Process] = analysis.sv2.processes
 
     for process in processes.values():
+        # we map VMRay's monitor ID to the OS PID to make it easier for users
+        # to follow the processes in capa's output
         pid: int = analysis.get_process_os_pid(process.monitor_id)
         ppid: int = (
             analysis.get_process_os_pid(processes[process.ref_parent_process.path[1]].monitor_id)

From 1f5b6ec52c1e08c885286d1b08a2dbe7c0d37bde Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 19:00:48 -0600
Subject: [PATCH 051/105] vmray: improve comments

---
 capa/features/extractors/vmray/__init__.py  | 7 ++++++-
 capa/features/extractors/vmray/extractor.py | 1 +
 capa/features/extractors/vmray/models.py    | 6 ++++--
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 0c3220944..6c6c3d137 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -27,9 +27,12 @@ class VMRayAnalysis:
     def __init__(self, zipfile_path: Path):
         self.zipfile = ZipFile(zipfile_path, "r")
 
+        # summary_v2.json is the entry point to the entire VMRay archive and
+        # we use its data to find everything else that we need for capa
         sv2_json = json.loads(self.zipfile.read("logs/summary_v2.json", pwd=DEFAULT_ARCHIVE_PASSWORD))
         self.sv2 = SummaryV2.model_validate(sv2_json)
 
+        # flog.xml contains all of the call information that VMRay captured during execution
         flog_xml = self.zipfile.read("logs/flog.xml", pwd=DEFAULT_ARCHIVE_PASSWORD)
         flog_json = xmltodict.parse(flog_xml, attr_prefix="")
         self.flog = Flog.model_validate(flog_json)
@@ -58,6 +61,8 @@ def __init__(self, zipfile_path: Path):
         if not self.sample_file_static_data.pe:
             raise UnsupportedFormatError("VMRay feature extractor only supports PE at this time")
 
+        # VMRay does not store static strings for the sample file so we must use the source file
+        # stored in the archive
         sample_sha256: str = self.sample_file_analysis.hash_values.sha256.lower()
         sample_file_path: str = f"internal/static_analyses/{sample_sha256}/objects/files/{sample_sha256}"
 
@@ -105,7 +110,7 @@ def _compute_process_ids(self):
             self.process_ids[process.monitor_id] = process.os_pid
 
     def _compute_process_threads(self):
-        # logs/flog.xml appears to be the only file that contains thread-related
+        # logs/flog.xml appears to be the only file that contains thread-related data
         # so we use it here to map processes to threads
         for function_call in self.flog.analysis.function_calls:
             pid: int = self.get_process_os_pid(function_call.process_id)  # flog.xml uses process monitor ID, not OS PID
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index e535cfe74..e1501d3e9 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -60,6 +60,7 @@ def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature,
         yield from []
 
     def get_process_name(self, ph) -> str:
+        # TODO (meh): bring to parity with cape sandbox extractor https://github.com/mandiant/capa/issues/2148
         process: Process = ph.inner
         return process.image_name
 
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index df576111d..4ebbb436a 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -72,6 +72,7 @@ def validate_param_list(value):
         return [value]
 
 
+# convert the input value to a Python int type before inner validation is called
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
 
@@ -87,8 +88,9 @@ class Param(BaseModel):
     deref: Optional[ParamDeref] = None
 
 
-# params may be stored as a list of Param or a single Param
-# so we ensure a list is used for single Param as well
+# params may be stored as a list of Param or a single Param so we ensure
+# the input value to Python list type before the inner validation is called
+# to make it much easier to parse later
 ParamList = Annotated[List[Param], BeforeValidator(validate_param_list)]
 
 

From 26b5870ef4e2514be5dc0bcbe84c095dcad86913 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 19:06:06 -0600
Subject: [PATCH 052/105] vmray: improve comments

---
 capa/features/extractors/vmray/models.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 4ebbb436a..d0897c82a 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -72,7 +72,7 @@ def validate_param_list(value):
         return [value]
 
 
-# convert the input value to a Python int type before inner validation is called
+# convert the input value to a Python int type before inner validation (int) is called
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
 
@@ -88,9 +88,9 @@ class Param(BaseModel):
     deref: Optional[ParamDeref] = None
 
 
-# params may be stored as a list of Param or a single Param so we ensure
-# the input value to Python list type before the inner validation is called
-# to make it much easier to parse later
+# params may be stored as a list of Param or a single Param so we convert
+# the input value to Python list type before the inner validation (List[Param])
+# is called
 ParamList = Annotated[List[Param], BeforeValidator(validate_param_list)]
 
 

From 28c278b9e6ea741f9e2bc609f82ee7420d4f98ef Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 19:09:10 -0600
Subject: [PATCH 053/105] vmray: improve comments

---
 capa/features/extractors/vmray/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 6c6c3d137..ff50ef5d4 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -35,6 +35,7 @@ def __init__(self, zipfile_path: Path):
         # flog.xml contains all of the call information that VMRay captured during execution
         flog_xml = self.zipfile.read("logs/flog.xml", pwd=DEFAULT_ARCHIVE_PASSWORD)
         flog_json = xmltodict.parse(flog_xml, attr_prefix="")
+        # TODO (meh): we may need to validate support for the analysis version https://github.com/mandiant/capa/issues/2148
         self.flog = Flog.model_validate(flog_json)
 
         self.exports: Dict[int, str] = {}

From 4f2467cae0242de8809f983880900e6f5b98f732 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 19:53:33 -0600
Subject: [PATCH 054/105] vmray: update CHANGELOG

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 531aaa758..f4a44fbdc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
 ## master (unreleased)
 
 ### New Features
-
+- dynamic: add support for VMRay dynamic sandbox traces #2208 @mike-hunhoff @r-sm2024 @mr-tz
 ### Breaking Changes
 
 ### New Rules (0)

From 5214675eebb72e46e8c29541979ff8e5a1baff57 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 19:55:06 -0600
Subject: [PATCH 055/105] vmray: update tests.yml

---
 .github/workflows/tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b02b52061..84c4f2fe3 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,9 +2,9 @@ name: CI
 
 on:
   push:
-    branches: [ master, vmray-extractor ]
+    branches: [ master ]
   pull_request:
-    branches: [ master, vmray-extractor ]
+    branches: [ master ]
 
 permissions: read-all
 

From 42fddfbf31e0bdf8ff8d1a56238a6dfe4482c313 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 20:19:06 -0600
Subject: [PATCH 056/105] vmray: improve comments

---
 capa/features/extractors/vmray/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index d0897c82a..db04d8cd7 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -122,8 +122,8 @@ class FunctionReturn(BaseModel):
 
 
 class Analysis(BaseModel):
-    log_version: str
-    analyzer_version: str
+    log_version: str # tested 2
+    analyzer_version: str # tested 2024.2.1
     # analysis_date: str
 
     function_calls: List[FunctionCall] = Field(alias="fncall", default=[])

From af26bef611efae7d391a3d59e42ea14099bc644e Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 12 Jul 2024 20:21:57 -0600
Subject: [PATCH 057/105] vmray: fix lints

---
 capa/features/extractors/vmray/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index db04d8cd7..e0e2fe2e5 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -122,8 +122,8 @@ class FunctionReturn(BaseModel):
 
 
 class Analysis(BaseModel):
-    log_version: str # tested 2
-    analyzer_version: str # tested 2024.2.1
+    log_version: str  # tested 2
+    analyzer_version: str  # tested 2024.2.1
     # analysis_date: str
 
     function_calls: List[FunctionCall] = Field(alias="fncall", default=[])

From b68a91e10b68a84405fe05de787d110df6cad731 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 17 Jul 2024 12:06:23 -0600
Subject: [PATCH 058/105] vmray: validate supported flog version

---
 capa/features/extractors/vmray/__init__.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index ff50ef5d4..acb3b0e2a 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -22,6 +22,8 @@
 # TODO (meh): is default password "infected" good enough?? https://github.com/mandiant/capa/issues/2148
 DEFAULT_ARCHIVE_PASSWORD = b"infected"
 
+SUPPORTED_FLOG_VERSIONS = ("2",)
+
 
 class VMRayAnalysis:
     def __init__(self, zipfile_path: Path):
@@ -38,6 +40,12 @@ def __init__(self, zipfile_path: Path):
         # TODO (meh): we may need to validate support for the analysis version https://github.com/mandiant/capa/issues/2148
         self.flog = Flog.model_validate(flog_json)
 
+        if self.flog.analysis.log_version not in SUPPORTED_FLOG_VERSIONS:
+            logger.warning("VMRay feature extractor does not support flog version %s", self.flog.analysis.log_version)
+            raise UnsupportedFormatError(
+                f"VMRay feature extractor does not support flog version {self.flog.analysis.log_version}"
+            )
+
         self.exports: Dict[int, str] = {}
         self.imports: Dict[int, Tuple[str, str]] = {}
         self.sections: Dict[int, str] = {}

From ec7e43193e9f4eb143ad0731664b82ab632b5f67 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 17 Jul 2024 12:10:18 -0600
Subject: [PATCH 059/105] vmray: update comment for extract_process_features

---
 capa/features/extractors/vmray/extractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index e1501d3e9..db27a918a 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -56,7 +56,7 @@ def get_processes(self) -> Iterator[ProcessHandle]:
         yield from capa.features.extractors.vmray.file.get_processes(self.analysis)
 
     def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
-        # TODO (meh): https://github.com/mandiant/capa/issues/2148
+        # we have not identified process-specific features for VMRay yet
         yield from []
 
     def get_process_name(self, ph) -> str:

From cc87ef39d59c2963cd85120e14b76da65fdca747 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 17 Jul 2024 12:18:01 -0600
Subject: [PATCH 060/105] vmray: remove and document extract_call_features
 comments

---
 capa/features/extractors/vmray/call.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index 0ee43b7d3..2c39e2c4d 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -39,13 +39,6 @@ def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -
         for param in call.params_in.params:
             yield from get_call_param_features(param, ch)
 
-    """
-    # TODO (meh): params_out stores return value, not sure where to emit this?? https://github.com/mandiant/capa/issues/2148
-    if call.params_out:
-        for param in call.params_out.params:
-            yield from get_call_param_features(param, ch)
-    """
-
     yield API(call.name), ch.address
 
 

From 100df45cc055eea9fd2b5da77511dee701bd8ea6 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 17 Jul 2024 12:27:14 -0600
Subject: [PATCH 061/105] vmray: add logging for skipped deref param types

---
 capa/features/extractors/vmray/call.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
index 2c39e2c4d..436b4bebb 100644
--- a/capa/features/extractors/vmray/call.py
+++ b/capa/features/extractors/vmray/call.py
@@ -27,6 +27,8 @@ def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[Tuple[Feat
                 yield Number(hexint(param.deref.value)), ch.address
             elif param.deref.type_ in PARAM_TYPE_STR:
                 yield String(param.deref.value), ch.address
+            else:
+                logger.debug("skipping deref param type %s", param.deref.type_)
     elif param.value is not None:
         if param.type_ in PARAM_TYPE_INT:
             yield Number(hexint(param.value)), ch.address

From 19a6f3ad495a5e4a0d5fb1c0ecdfec6b02197c10 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 17 Jul 2024 12:37:51 -0600
Subject: [PATCH 062/105] vmray: improve supported file type validation

---
 capa/features/extractors/vmray/__init__.py  | 33 ++++++++++++++-------
 capa/features/extractors/vmray/extractor.py |  2 ++
 capa/features/extractors/vmray/global_.py   |  1 +
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index acb3b0e2a..2089f913a 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -7,7 +7,7 @@
 # See the License for the specific language governing permissions and limitations under the License.
 import json
 import logging
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Optional
 from pathlib import Path
 from zipfile import ZipFile
 from collections import defaultdict
@@ -54,20 +54,18 @@ def __init__(self, zipfile_path: Path):
         self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
         self.base_address: int
 
-        self.sample_file_name: str
-        self.sample_file_analysis: File
-        self.sample_file_static_data: StaticData
+        self.sample_file_name: Optional[str] = None
+        self.sample_file_analysis: Optional[File] = None
+        self.sample_file_static_data: Optional[StaticData] = None
 
         self._find_sample_file()
-        self._compute_base_address()
-        self._compute_imports()
-        self._compute_exports()
-        self._compute_sections()
-        self._compute_process_ids()
-        self._compute_process_threads()
-        self._compute_process_calls()
+
+        if self.sample_file_name is None or self.sample_file_analysis is None:
+            logger.warning("VMRay archive does not contain sample file")
+            raise UnsupportedFormatError("VMRay archive does not contain sample file")
 
         if not self.sample_file_static_data.pe:
+            logger.warning("VMRay feature extractor only supports PE at this time")
             raise UnsupportedFormatError("VMRay feature extractor only supports PE at this time")
 
         # VMRay does not store static strings for the sample file so we must use the source file
@@ -79,6 +77,15 @@ def __init__(self, zipfile_path: Path):
 
         self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)
 
+        # only compute these if we've found a supported sample file type
+        self._compute_base_address()
+        self._compute_imports()
+        self._compute_exports()
+        self._compute_sections()
+        self._compute_process_ids()
+        self._compute_process_threads()
+        self._compute_process_calls()
+
     def _find_sample_file(self):
         for file_name, file_analysis in self.sv2.files.items():
             if file_analysis.is_sample:
@@ -92,21 +99,25 @@ def _find_sample_file(self):
                 break
 
     def _compute_base_address(self):
+        assert self.sample_file_static_data is not None
         if self.sample_file_static_data.pe:
             self.base_address = self.sample_file_static_data.pe.basic_info.image_base
 
     def _compute_exports(self):
+        assert self.sample_file_static_data is not None
         if self.sample_file_static_data.pe:
             for export in self.sample_file_static_data.pe.exports:
                 self.exports[export.address] = export.api.name
 
     def _compute_imports(self):
+        assert self.sample_file_static_data is not None
         if self.sample_file_static_data.pe:
             for module in self.sample_file_static_data.pe.imports:
                 for api in module.apis:
                     self.imports[api.address] = (module.dll, api.api.name)
 
     def _compute_sections(self):
+        assert self.sample_file_static_data is not None
         if self.sample_file_static_data.pe:
             for section in self.sample_file_static_data.pe.sections:
                 self.sections[section.virtual_address] = section.name
diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index db27a918a..0e420836d 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -29,6 +29,8 @@
 
 class VMRayExtractor(DynamicFeatureExtractor):
     def __init__(self, analysis: VMRayAnalysis):
+        assert analysis.sample_file_analysis is not None
+
         super().__init__(
             hashes=SampleHashes(
                 md5=analysis.sample_file_analysis.hash_values.md5.lower(),
diff --git a/capa/features/extractors/vmray/global_.py b/capa/features/extractors/vmray/global_.py
index c2a7e3b78..69f91bf08 100644
--- a/capa/features/extractors/vmray/global_.py
+++ b/capa/features/extractors/vmray/global_.py
@@ -29,6 +29,7 @@ def extract_arch(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
 
 
 def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
+    assert analysis.sample_file_static_data is not None
     if analysis.sample_file_static_data.pe:
         yield Format(FORMAT_PE), NO_ADDRESS
     else:

From 330c77a32aaa9a2e4c1a93bd2f470b39f13da2ff Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 17 Jul 2024 15:04:00 -0600
Subject: [PATCH 063/105] vmray: implement get_call_name

---
 capa/features/extractors/vmray/extractor.py | 32 +++++++++++++++++++--
 capa/features/extractors/vmray/models.py    |  5 +---
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 0e420836d..97f893d94 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -7,7 +7,7 @@
 # See the License for the specific language governing permissions and limitations under the License.
 
 
-from typing import Tuple, Iterator
+from typing import List, Tuple, Iterator
 from pathlib import Path
 
 import capa.helpers
@@ -17,7 +17,7 @@
 from capa.features.common import Feature, Characteristic
 from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress
 from capa.features.extractors.vmray import VMRayAnalysis
-from capa.features.extractors.vmray.models import Process, FunctionCall
+from capa.features.extractors.vmray.models import PARAM_TYPE_STR, Process, ParamList, FunctionCall
 from capa.features.extractors.base_extractor import (
     CallHandle,
     SampleHashes,
@@ -27,6 +27,20 @@
 )
 
 
+def format_params(params: ParamList) -> List[str]:
+    params_list: List[str] = []
+
+    for param in params:
+        if param.deref and param.deref.value is not None:
+            deref_value: str = f'"{param.deref.value}"' if param.deref.type_ in PARAM_TYPE_STR else param.deref.value
+            params_list.append(f"{param.name}: {deref_value}")
+        else:
+            value: str = "" if param.value is None else param.value
+            params_list.append(f"{param.name}: {value}")
+
+    return params_list
+
+
 class VMRayExtractor(DynamicFeatureExtractor):
     def __init__(self, analysis: VMRayAnalysis):
         assert analysis.sample_file_analysis is not None
@@ -90,7 +104,19 @@ def extract_call_features(
 
     def get_call_name(self, ph, th, ch) -> str:
         call: FunctionCall = ch.inner
-        return call.name
+        call_formatted: str = call.name
+
+        # format input parameters
+        if call.params_in:
+            call_formatted += f"({', '.join(format_params(call.params_in.params))})"
+        else:
+            call_formatted += "()"
+
+        # format output parameters
+        if call.params_out:
+            call_formatted += f" -> {', '.join(format_params(call.params_out.params))}"
+
+        return call_formatted
 
     @classmethod
     def from_zipfile(cls, zipfile_path: Path):
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index e0e2fe2e5..4291e7d08 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -108,17 +108,14 @@ class FunctionCall(BaseModel):
     # addr: HexInt
     # from_addr: HexInt = Field(alias="from")
     params_in: Params = Field(alias="in", default=None)
-    # params_out: Params = Field(alias="out", default=None)
+    params_out: Params = Field(alias="out", default=None)
 
 
-"""
-# not useful for capa, but included for documentation in case
 class FunctionReturn(BaseModel):
     ts: HexInt
     fncall_id: HexInt
     addr: HexInt
     from_addr: HexInt = Field(alias="from")
-"""
 
 
 class Analysis(BaseModel):

From fd7bd94b488aeb8162059308e12180eee0f1d68b Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 18 Jul 2024 08:50:20 -0600
Subject: [PATCH 064/105] vmray: remove outdated comments

---
 capa/features/extractors/vmray/extractor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index 97f893d94..a8f3a79a4 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -76,7 +76,6 @@ def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature,
         yield from []
 
     def get_process_name(self, ph) -> str:
-        # TODO (meh): bring to parity with cape sandbox extractor https://github.com/mandiant/capa/issues/2148
         process: Process = ph.inner
         return process.image_name
 

From 5afea2947394382f014a101f84d53ea3b8f26169 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 18 Jul 2024 09:06:58 -0600
Subject: [PATCH 065/105] vmray: update CHANGELOG release notes with VMRay
 integration

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f4a44fbdc..ff8f97596 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## master (unreleased)
 
+Unlock powerful malware analysis with capa's new [VMRay sandbox](https://www.vmray.com/) integration! Simply provide a VMRay analysis archive, and capa will automatically extract and match capabilties, streamlining your workflow.
+
 ### New Features
 - dynamic: add support for VMRay dynamic sandbox traces #2208 @mike-hunhoff @r-sm2024 @mr-tz
 ### Breaking Changes

From 998537ddf801febb22dfbe314f603f9d340636b2 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 18 Jul 2024 09:10:50 -0600
Subject: [PATCH 066/105] vmray: remove outdated comments

---
 capa/features/extractors/vmray/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 2089f913a..413272a43 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -37,7 +37,6 @@ def __init__(self, zipfile_path: Path):
         # flog.xml contains all of the call information that VMRay captured during execution
         flog_xml = self.zipfile.read("logs/flog.xml", pwd=DEFAULT_ARCHIVE_PASSWORD)
         flog_json = xmltodict.parse(flog_xml, attr_prefix="")
-        # TODO (meh): we may need to validate support for the analysis version https://github.com/mandiant/capa/issues/2148
         self.flog = Flog.model_validate(flog_json)
 
         if self.flog.analysis.log_version not in SUPPORTED_FLOG_VERSIONS:

From 64a09d31469939b86cbf05f95d27afd4657b4d86 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 18 Jul 2024 11:20:03 -0600
Subject: [PATCH 067/105] vmray: remove broken assert for unique OS PIDs

---
 capa/features/extractors/vmray/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 413272a43..c268f85f2 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -123,9 +123,8 @@ def _compute_sections(self):
 
     def _compute_process_ids(self):
         for process in self.sv2.processes.values():
+            # we expect VMRay's monitor IDs to be unique, but OS PIDs may be reused
             assert process.monitor_id not in self.process_ids.keys()
-            assert process.os_pid not in self.process_ids.values()
-
             self.process_ids[process.monitor_id] = process.os_pid
 
     def _compute_process_threads(self):

From 6f7cc7cdb0142639d225e273ea42ce888d0e6a54 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 18 Jul 2024 11:33:42 -0600
Subject: [PATCH 068/105] vmray: improve detections for unsupported input files

---
 capa/features/extractors/vmray/__init__.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index c268f85f2..afe00cbc9 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -33,6 +33,7 @@ def __init__(self, zipfile_path: Path):
         # we use its data to find everything else that we need for capa
         sv2_json = json.loads(self.zipfile.read("logs/summary_v2.json", pwd=DEFAULT_ARCHIVE_PASSWORD))
         self.sv2 = SummaryV2.model_validate(sv2_json)
+        self.file_type: str = self.sv2.analysis_metadata.sample_type
 
         # flog.xml contains all of the call information that VMRay captured during execution
         flog_xml = self.zipfile.read("logs/flog.xml", pwd=DEFAULT_ARCHIVE_PASSWORD)
@@ -60,19 +61,25 @@ def __init__(self, zipfile_path: Path):
         self._find_sample_file()
 
         if self.sample_file_name is None or self.sample_file_analysis is None:
-            logger.warning("VMRay archive does not contain sample file")
-            raise UnsupportedFormatError("VMRay archive does not contain sample file")
+            logger.warning("VMRay archive does not contain sample file (file_type: %s)", self.file_type)
+            raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)", self.file_type)
+
+        if not self.sample_file_static_data:
+            logger.warning("VMRay archive does not contain static data (file_type: %s)", self.file_type)
+            raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)", self.file_type)
 
         if not self.sample_file_static_data.pe:
-            logger.warning("VMRay feature extractor only supports PE at this time")
-            raise UnsupportedFormatError("VMRay feature extractor only supports PE at this time")
+            logger.warning("VMRay feature extractor only supports PE at this time (file_type: %s)", self.file_type)
+            raise UnsupportedFormatError(
+                "VMRay feature extractor only supports PE at this time(file_type: %s)", self.file_type
+            )
 
         # VMRay does not store static strings for the sample file so we must use the source file
         # stored in the archive
         sample_sha256: str = self.sample_file_analysis.hash_values.sha256.lower()
         sample_file_path: str = f"internal/static_analyses/{sample_sha256}/objects/files/{sample_sha256}"
 
-        logger.debug("sample file path: %s", sample_file_path)
+        logger.debug("file_type: %s, file_path: %s", self.file_type, sample_file_path)
 
         self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)
 

From 24a31a8bc30878c5df39fbf3244cde53ad07f7a8 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 18 Jul 2024 14:23:20 -0600
Subject: [PATCH 069/105] vmray: add comments to __init__.py

---
 capa/features/extractors/vmray/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index afe00cbc9..27906d839 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -65,6 +65,7 @@ def __init__(self, zipfile_path: Path):
             raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)", self.file_type)
 
         if not self.sample_file_static_data:
+            # we see this for text files e.g. JScript file types
             logger.warning("VMRay archive does not contain static data (file_type: %s)", self.file_type)
             raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)", self.file_type)
 

From 8bf0d16fd8008c6a3c28f06279065bc120f11b9e Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 18 Jul 2024 17:52:33 -0600
Subject: [PATCH 070/105] vmray: add init support for ELF files

---
 capa/features/extractors/vmray/__init__.py | 15 +++++---
 capa/features/extractors/vmray/global_.py  | 17 ++++++++-
 capa/features/extractors/vmray/models.py   | 40 +++++++++++++++++++++-
 3 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 27906d839..141a2595b 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -69,10 +69,12 @@ def __init__(self, zipfile_path: Path):
             logger.warning("VMRay archive does not contain static data (file_type: %s)", self.file_type)
             raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)", self.file_type)
 
-        if not self.sample_file_static_data.pe:
-            logger.warning("VMRay feature extractor only supports PE at this time (file_type: %s)", self.file_type)
+        if not self.sample_file_static_data.pe and not self.sample_file_static_data.elf:
+            logger.warning(
+                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)", self.file_type
+            )
             raise UnsupportedFormatError(
-                "VMRay feature extractor only supports PE at this time(file_type: %s)", self.file_type
+                "VMRay feature extractor only supports PE and ELF at this time(file_type: %s)", self.file_type
             )
 
         # VMRay does not store static strings for the sample file so we must use the source file
@@ -126,8 +128,11 @@ def _compute_imports(self):
     def _compute_sections(self):
         assert self.sample_file_static_data is not None
         if self.sample_file_static_data.pe:
-            for section in self.sample_file_static_data.pe.sections:
-                self.sections[section.virtual_address] = section.name
+            for pefile_section in self.sample_file_static_data.pe.sections:
+                self.sections[pefile_section.virtual_address] = pefile_section.name
+        elif self.sample_file_static_data.elf:
+            for elffile_section in self.sample_file_static_data.elf.sections:
+                self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name
 
     def _compute_process_ids(self):
         for process in self.sv2.processes.values():
diff --git a/capa/features/extractors/vmray/global_.py b/capa/features/extractors/vmray/global_.py
index 69f91bf08..82ab24585 100644
--- a/capa/features/extractors/vmray/global_.py
+++ b/capa/features/extractors/vmray/global_.py
@@ -9,7 +9,18 @@
 import logging
 from typing import Tuple, Iterator
 
-from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Feature
+from capa.features.common import (
+    OS,
+    OS_LINUX,
+    ARCH_I386,
+    FORMAT_PE,
+    ARCH_AMD64,
+    FORMAT_ELF,
+    OS_WINDOWS,
+    Arch,
+    Format,
+    Feature,
+)
 from capa.features.address import NO_ADDRESS, Address
 from capa.features.extractors.vmray import VMRayAnalysis
 
@@ -32,6 +43,8 @@ def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]
     assert analysis.sample_file_static_data is not None
     if analysis.sample_file_static_data.pe:
         yield Format(FORMAT_PE), NO_ADDRESS
+    elif analysis.sample_file_static_data.elf:
+        yield Format(FORMAT_ELF), NO_ADDRESS
     else:
         logger.warning("unrecognized file format: %s", analysis.sv2.analysis_metadata.sample_type)
         raise ValueError(
@@ -44,6 +57,8 @@ def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
 
     if "windows" in sample_type.lower():
         yield OS(OS_WINDOWS), NO_ADDRESS
+    elif "linux" in sample_type.lower():
+        yield OS(OS_LINUX), NO_ADDRESS
     else:
         logger.warning("unrecognized OS: %s", sample_type)
         raise ValueError(f"unrecognized OS from the VMRay report: {sample_type}")
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 4291e7d08..9d2bd2713 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -72,6 +72,13 @@ def validate_param_list(value):
         return [value]
 
 
+def validate_call_name(value):
+    if value.startswith("sys_"):
+        return value[4:]
+    else:
+        return value
+
+
 # convert the input value to a Python int type before inner validation (int) is called
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
@@ -98,13 +105,18 @@ class Params(BaseModel):
     params: ParamList = Field(alias="param")
 
 
+# call names may contain uneeded data so we remove that data before
+# the inner validation (str) is called
+CallName = Annotated[str, BeforeValidator(validate_call_name)]
+
+
 # models flog.xml files
 class FunctionCall(BaseModel):
     # ts: HexInt
     fncall_id: HexInt
     process_id: HexInt
     thread_id: HexInt
-    name: str
+    name: CallName
     # addr: HexInt
     # from_addr: HexInt = Field(alias="from")
     params_in: Params = Field(alias="in", default=None)
@@ -193,8 +205,34 @@ class PEFile(BaseModel):
     sections: List[PEFileSection] = []
 
 
+class ElfFileSectionHeader(BaseModel):
+    sh_name: str
+    sh_addr: int
+
+
+class ElfFileSection(BaseModel):
+    header: ElfFileSectionHeader
+
+
+"""
+class ElfFileHeader(BaseModel):
+    file_class: str
+    endianness: str
+    file_type: str
+    architecture: str
+    architecture_human_str: str
+    entry_point: int
+"""
+
+
+class ElfFile(BaseModel):
+    # file_header: ElfFileHeader
+    sections: List[ElfFileSection]
+
+
 class StaticData(BaseModel):
     pe: Optional[PEFile] = None
+    elf: Optional[ElfFile] = None
 
 
 class FileHashes(BaseModel):

From 6e0dc83451e6c3d3281a20735bcf87a00a72cc07 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 11:51:16 -0600
Subject: [PATCH 071/105] vmray: refactor global_.py

---
 capa/features/extractors/vmray/global_.py | 26 +++++++++++------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/capa/features/extractors/vmray/global_.py b/capa/features/extractors/vmray/global_.py
index 82ab24585..95f7cc90d 100644
--- a/capa/features/extractors/vmray/global_.py
+++ b/capa/features/extractors/vmray/global_.py
@@ -28,15 +28,15 @@
 
 
 def extract_arch(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    sample_type: str = analysis.sv2.analysis_metadata.sample_type
+    file_type: str = analysis.file_type
 
-    if "x86-32" in sample_type:
+    if "x86-32" in file_type:
         yield Arch(ARCH_I386), NO_ADDRESS
-    elif "x86-64" in sample_type:
+    elif "x86-64" in file_type:
         yield Arch(ARCH_AMD64), NO_ADDRESS
     else:
-        logger.warning("unrecognized arch: %s", sample_type)
-        raise ValueError(f"unrecognized arch from the VMRay report: {sample_type}")
+        logger.warning("unrecognized arch: %s", file_type)
+        raise ValueError(f"unrecognized arch from the VMRay report: {file_type}")
 
 
 def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
@@ -46,22 +46,20 @@ def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]
     elif analysis.sample_file_static_data.elf:
         yield Format(FORMAT_ELF), NO_ADDRESS
     else:
-        logger.warning("unrecognized file format: %s", analysis.sv2.analysis_metadata.sample_type)
-        raise ValueError(
-            f"unrecognized file format from the VMRay report: {analysis.sv2.analysis_metadata.sample_type}"
-        )
+        logger.warning("unrecognized file format: %s", analysis.file_type)
+        raise ValueError(f"unrecognized file format from the VMRay report: {analysis.file_type}")
 
 
 def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    sample_type: str = analysis.sv2.analysis_metadata.sample_type
+    file_type: str = analysis.file_type
 
-    if "windows" in sample_type.lower():
+    if "windows" in file_type.lower():
         yield OS(OS_WINDOWS), NO_ADDRESS
-    elif "linux" in sample_type.lower():
+    elif "linux" in file_type.lower():
         yield OS(OS_LINUX), NO_ADDRESS
     else:
-        logger.warning("unrecognized OS: %s", sample_type)
-        raise ValueError(f"unrecognized OS from the VMRay report: {sample_type}")
+        logger.warning("unrecognized OS: %s", file_type)
+        raise ValueError(f"unrecognized OS from the VMRay report: {file_type}")
 
 
 def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:

From 673f7cccfc30d73f75025179821fb0b4a430398b Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 11:57:07 -0600
Subject: [PATCH 072/105] vmray: refactor models.py

---
 capa/features/extractors/vmray/models.py | 30 ++++++++++++------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 9d2bd2713..d7a06c3c9 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -65,24 +65,11 @@ def validate_hex_int(value: Union[str, int]) -> int:
     return hexint(value)
 
 
-def validate_param_list(value):
-    if isinstance(value, list):
-        return value
-    else:
-        return [value]
-
-
-def validate_call_name(value):
-    if value.startswith("sys_"):
-        return value[4:]
-    else:
-        return value
-
-
 # convert the input value to a Python int type before inner validation (int) is called
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
 
+# models for summary_v2.json files
 class ParamDeref(BaseModel):
     type_: str = Field(alias="type")
     value: Optional[str] = None
@@ -95,6 +82,13 @@ class Param(BaseModel):
     deref: Optional[ParamDeref] = None
 
 
+def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
+    if isinstance(value, list):
+        return value
+    else:
+        return [value]
+
+
 # params may be stored as a list of Param or a single Param so we convert
 # the input value to Python list type before the inner validation (List[Param])
 # is called
@@ -105,6 +99,13 @@ class Params(BaseModel):
     params: ParamList = Field(alias="param")
 
 
+def validate_call_name(value: str) -> str:
+    if value.startswith("sys_"):
+        return value[4:]
+    else:
+        return value
+
+
 # call names may contain uneeded data so we remove that data before
 # the inner validation (str) is called
 CallName = Annotated[str, BeforeValidator(validate_call_name)]
@@ -143,7 +144,6 @@ class Flog(BaseModel):
     analysis: Analysis
 
 
-# models for summary_v2.json files
 class GenericReference(BaseModel):
     path: List[str]
     source: str

From 658927c103315fec4a077ec55a139983c10391f1 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 11:58:48 -0600
Subject: [PATCH 073/105] vmray: refactor models.py

---
 capa/features/extractors/vmray/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index d7a06c3c9..6d61ce158 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -69,7 +69,7 @@ def validate_hex_int(value: Union[str, int]) -> int:
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
 
-# models for summary_v2.json files
+# models flog.xml file
 class ParamDeref(BaseModel):
     type_: str = Field(alias="type")
     value: Optional[str] = None
@@ -111,7 +111,6 @@ def validate_call_name(value: str) -> str:
 CallName = Annotated[str, BeforeValidator(validate_call_name)]
 
 
-# models flog.xml files
 class FunctionCall(BaseModel):
     # ts: HexInt
     fncall_id: HexInt
@@ -144,6 +143,7 @@ class Flog(BaseModel):
     analysis: Analysis
 
 
+# models for summary_v2.json file
 class GenericReference(BaseModel):
     path: List[str]
     source: str

From 28792ec6a69be2f96b981485f04f1c4288eb86c7 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 13:56:46 -0600
Subject: [PATCH 074/105] vmray: add model tests for FunctionCall

---
 capa/features/extractors/vmray/__init__.py |  8 ++-
 capa/features/extractors/vmray/models.py   |  5 ++
 tests/test_vmray_model.py                  | 59 ++++++++++++++++++++++
 3 files changed, 67 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_vmray_model.py

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 141a2595b..c605c7187 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -12,10 +12,8 @@
 from zipfile import ZipFile
 from collections import defaultdict
 
-import xmltodict
-
 from capa.exceptions import UnsupportedFormatError
-from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall
+from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict
 
 logger = logging.getLogger(__name__)
 
@@ -37,8 +35,8 @@ def __init__(self, zipfile_path: Path):
 
         # flog.xml contains all of the call information that VMRay captured during execution
         flog_xml = self.zipfile.read("logs/flog.xml", pwd=DEFAULT_ARCHIVE_PASSWORD)
-        flog_json = xmltodict.parse(flog_xml, attr_prefix="")
-        self.flog = Flog.model_validate(flog_json)
+        flog_dict = xml_to_dict(flog_xml)
+        self.flog = Flog.model_validate(flog_dict)
 
         if self.flog.analysis.log_version not in SUPPORTED_FLOG_VERSIONS:
             logger.warning("VMRay feature extractor does not support flog version %s", self.flog.analysis.log_version)
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 6d61ce158..6f9997444 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -8,6 +8,7 @@
 
 from typing import Dict, List, Union, Optional
 
+import xmltodict
 from pydantic import Field, BaseModel
 from typing_extensions import Annotated
 from pydantic.functional_validators import BeforeValidator
@@ -54,6 +55,10 @@
 )
 
 
+def xml_to_dict(xml):
+    return xmltodict.parse(xml, attr_prefix="")
+
+
 def hexint(value: Union[str, int]) -> int:
     if isinstance(value, str):
         return int(value, 16) if value.startswith("0x") else int(value, 10)
diff --git a/tests/test_vmray_model.py b/tests/test_vmray_model.py
new file mode 100644
index 000000000..0b0d86bf3
--- /dev/null
+++ b/tests/test_vmray_model.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+import textwrap
+
+from capa.features.extractors.vmray.models import Param, FunctionCall, xml_to_dict
+
+
+def test_vmray_model_call():
+    call_xml = textwrap.dedent(
+        """
+        <fncall ts="9044" fncall_id="18" process_id="1" thread_id="1" name="sys_time" addr="0xaaaaaaaaaaaaaaaa" from="0xaaaaaaaa">
+            <kernel/>
+            <in>
+                <param name="tloc" type="unknown" value="0x0"/>
+            </in>
+            <out>
+                <param name="ret_val" type="unknown" value="0xaaaaaaaa"/>
+            </out>
+        </fncall>
+        """
+    )
+    call: FunctionCall = FunctionCall.model_validate(xml_to_dict(call_xml)["fncall"])
+
+    assert call.fncall_id == 18
+    assert call.process_id == 1
+    assert call.thread_id == 1
+    assert call.name == "time"
+    assert call.params_in is not None
+    assert call.params_out is not None
+
+
+def test_vmray_model_call_param():
+    param_xml = textwrap.dedent(
+        """
+        <param name="addrlen" type="signed_32bit" value="16"/>
+        """
+    )
+    param: Param = Param.model_validate(xml_to_dict(param_xml)["param"])
+
+    assert param.value == "16"
+
+
+def test_vmray_model_call_param_deref():
+    param_xml = textwrap.dedent(
+        """
+        <param name="buf" type="ptr" value="0xaaaaaaaa">
+            <deref type="str" value="Hello world"/>
+        </param>
+        """
+    )
+    param: Param = Param.model_validate(xml_to_dict(param_xml)["param"])
+
+    assert param.deref is not None
+    assert param.deref.value == "Hello world"

From 2ba2a2b013648b3be645ae5f93e1c6f2302344ff Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 15:05:21 -0600
Subject: [PATCH 075/105] vmray: remove unneeded json.loads from __init__.py

---
 capa/features/extractors/vmray/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index c605c7187..5c27f83ad 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -5,7 +5,6 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-import json
 import logging
 from typing import Dict, List, Tuple, Optional
 from pathlib import Path
@@ -29,8 +28,9 @@ def __init__(self, zipfile_path: Path):
 
         # summary_v2.json is the entry point to the entire VMRay archive and
         # we use its data to find everything else that we need for capa
-        sv2_json = json.loads(self.zipfile.read("logs/summary_v2.json", pwd=DEFAULT_ARCHIVE_PASSWORD))
-        self.sv2 = SummaryV2.model_validate(sv2_json)
+        self.sv2 = SummaryV2.model_validate_json(
+            self.zipfile.read("logs/summary_v2.json", pwd=DEFAULT_ARCHIVE_PASSWORD)
+        )
         self.file_type: str = self.sv2.analysis_metadata.sample_type
 
         # flog.xml contains all of the call information that VMRay captured during execution

From 4490097e11f8fe9378526f93d7486765633462fd Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 15:28:47 -0600
Subject: [PATCH 076/105] vmray: add summary_v2.json model tests

---
 capa/features/extractors/vmray/models.py |   4 +-
 tests/test_vmray_model.py                | 106 ++++++++++++++++++++---
 2 files changed, 96 insertions(+), 14 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 6f9997444..47342b5e7 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -159,9 +159,9 @@ class StaticDataReference(GenericReference): ...
 
 class PEFileBasicInfo(BaseModel):
     # compile_time: str
-    file_type: str
+    # file_type: str
     image_base: int
-    machine_type: str
+    # machine_type: str
     # size_of_code: int
     # size_of_initialized_data: int
     # size_of_uninitialized_data: int
diff --git a/tests/test_vmray_model.py b/tests/test_vmray_model.py
index 0b0d86bf3..3b65b5637 100644
--- a/tests/test_vmray_model.py
+++ b/tests/test_vmray_model.py
@@ -7,14 +7,13 @@
 # See the License for the specific language governing permissions and limitations under the License.
 import textwrap
 
-from capa.features.extractors.vmray.models import Param, FunctionCall, xml_to_dict
+from capa.features.extractors.vmray.models import Param, PEFile, ElfFile, FunctionCall, AnalysisMetadata, xml_to_dict
 
 
-def test_vmray_model_call():
-    call_xml = textwrap.dedent(
+def test_vmray_model_function_call():
+    param_str = textwrap.dedent(
         """
-        <fncall ts="9044" fncall_id="18" process_id="1" thread_id="1" name="sys_time" addr="0xaaaaaaaaaaaaaaaa" from="0xaaaaaaaa">
-            <kernel/>
+        <fncall fncall_id="18" process_id="1" thread_id="1" name="sys_time">
             <in>
                 <param name="tloc" type="unknown" value="0x0"/>
             </in>
@@ -24,7 +23,7 @@ def test_vmray_model_call():
         </fncall>
         """
     )
-    call: FunctionCall = FunctionCall.model_validate(xml_to_dict(call_xml)["fncall"])
+    call: FunctionCall = FunctionCall.model_validate(xml_to_dict(param_str)["fncall"])
 
     assert call.fncall_id == 18
     assert call.process_id == 1
@@ -34,26 +33,109 @@ def test_vmray_model_call():
     assert call.params_out is not None
 
 
-def test_vmray_model_call_param():
-    param_xml = textwrap.dedent(
+def test_vmray_model_param():
+    param_str = textwrap.dedent(
         """
         <param name="addrlen" type="signed_32bit" value="16"/>
         """
     )
-    param: Param = Param.model_validate(xml_to_dict(param_xml)["param"])
+    param: Param = Param.model_validate(xml_to_dict(param_str)["param"])
 
     assert param.value == "16"
 
 
-def test_vmray_model_call_param_deref():
-    param_xml = textwrap.dedent(
+def test_vmray_model_param_deref():
+    param_str = textwrap.dedent(
         """
         <param name="buf" type="ptr" value="0xaaaaaaaa">
             <deref type="str" value="Hello world"/>
         </param>
         """
     )
-    param: Param = Param.model_validate(xml_to_dict(param_xml)["param"])
+    param: Param = Param.model_validate(xml_to_dict(param_str)["param"])
 
     assert param.deref is not None
     assert param.deref.value == "Hello world"
+
+
+def test_vmray_model_analysis_metadata():
+    analysis_metadata: AnalysisMetadata = AnalysisMetadata.model_validate_json(
+        """
+        {
+            "sample_type": "Linux ELF Executable (x86-64)",
+            "submission_filename": "abcd1234"
+        }
+        """
+    )
+
+    assert analysis_metadata.sample_type == "Linux ELF Executable (x86-64)"
+    assert analysis_metadata.submission_filename == "abcd1234"
+
+
+def test_vmray_model_elffile():
+    elffile: ElfFile = ElfFile.model_validate_json(
+        """
+        {
+            "sections": [
+                {
+                    "header": {
+                        "sh_name": "abcd1234",
+                        "sh_addr": 2863311530
+                    }
+                }
+            ]
+        }
+        """
+    )
+
+    assert elffile.sections is not None
+    assert elffile.sections[0].header is not None
+    assert elffile.sections[0].header.sh_name == "abcd1234"
+    assert elffile.sections[0].header.sh_addr == 2863311530
+
+
+def test_vmray_model_pefile():
+    pefile: PEFile = PEFile.model_validate_json(
+        """
+        {
+            "basic_info": {
+                "image_base": 2863311530
+            },
+            "imports": [
+            {
+                "apis": [
+                    {
+                        "address": 2863311530,
+                        "api": {
+                            "name": "Sleep"
+                        }
+                    }
+                ],
+                "dll": "KERNEL32.dll"
+                }
+            ],
+            "sections": [
+                {
+                    "name": ".text",
+                    "virtual_address": 2863311530
+                }
+            ],
+            "exports": [
+                {
+                    "api": {
+                        "name": "HellWorld",
+                        "ordinal": 10
+                    },
+                    "address": 2863311530
+                }
+            ]
+        }
+        """
+    )
+
+    assert pefile.basic_info.image_base == 2863311530
+    assert pefile.imports[0].dll == "KERNEL32.dll"
+    assert pefile.imports[0].apis[0].address == 2863311530
+    assert pefile.imports[0].apis[0].api.name == "Sleep"
+    assert pefile.sections[0].name == ".text"
+    assert pefile.sections[0].virtual_address == 2863311530

From 98939f8a8f0283f23318893a04b714f243f64116 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 15:38:26 -0600
Subject: [PATCH 077/105] vmray: improve FunctionCall model

---
 capa/features/extractors/vmray/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 47342b5e7..63ef3b273 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -124,8 +124,8 @@ class FunctionCall(BaseModel):
     name: CallName
     # addr: HexInt
     # from_addr: HexInt = Field(alias="from")
-    params_in: Params = Field(alias="in", default=None)
-    params_out: Params = Field(alias="out", default=None)
+    params_in: Optional[Params] = Field(alias="in", default=None)
+    params_out: Optional[Params] = Field(alias="out", default=None)
 
 
 class FunctionReturn(BaseModel):

From 4dfc53a58f676b01c7771a22e881b8e394b157c9 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 15:42:04 -0600
Subject: [PATCH 078/105] vmray: refactor model tests

---
 tests/test_vmray_model.py | 67 +++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/tests/test_vmray_model.py b/tests/test_vmray_model.py
index 3b65b5637..79e234970 100644
--- a/tests/test_vmray_model.py
+++ b/tests/test_vmray_model.py
@@ -7,30 +7,15 @@
 # See the License for the specific language governing permissions and limitations under the License.
 import textwrap
 
-from capa.features.extractors.vmray.models import Param, PEFile, ElfFile, FunctionCall, AnalysisMetadata, xml_to_dict
-
-
-def test_vmray_model_function_call():
-    param_str = textwrap.dedent(
-        """
-        <fncall fncall_id="18" process_id="1" thread_id="1" name="sys_time">
-            <in>
-                <param name="tloc" type="unknown" value="0x0"/>
-            </in>
-            <out>
-                <param name="ret_val" type="unknown" value="0xaaaaaaaa"/>
-            </out>
-        </fncall>
-        """
-    )
-    call: FunctionCall = FunctionCall.model_validate(xml_to_dict(param_str)["fncall"])
-
-    assert call.fncall_id == 18
-    assert call.process_id == 1
-    assert call.thread_id == 1
-    assert call.name == "time"
-    assert call.params_in is not None
-    assert call.params_out is not None
+from capa.features.extractors.vmray.models import (
+    Param,
+    PEFile,
+    ElfFile,
+    FunctionCall,
+    AnalysisMetadata,
+    hexint,
+    xml_to_dict,
+)
 
 
 def test_vmray_model_param():
@@ -41,7 +26,8 @@ def test_vmray_model_param():
     )
     param: Param = Param.model_validate(xml_to_dict(param_str)["param"])
 
-    assert param.value == "16"
+    assert param.value is not None
+    assert hexint(param.value) == 16
 
 
 def test_vmray_model_param_deref():
@@ -58,6 +44,35 @@ def test_vmray_model_param_deref():
     assert param.deref.value == "Hello world"
 
 
+def test_vmray_model_function_call():
+    function_call_str = textwrap.dedent(
+        """
+        <fncall fncall_id="18" process_id="1" thread_id="1" name="sys_time">
+            <in>
+                <param name="tloc" type="unknown" value="0x0"/>
+            </in>
+            <out>
+                <param name="ret_val" type="unknown" value="0xaaaaaaaa"/>
+            </out>
+        </fncall>
+        """
+    )
+    function_call: FunctionCall = FunctionCall.model_validate(xml_to_dict(function_call_str)["fncall"])
+
+    assert function_call.fncall_id == 18
+    assert function_call.process_id == 1
+    assert function_call.thread_id == 1
+    assert function_call.name == "time"
+
+    assert function_call.params_in is not None
+    assert function_call.params_in.params[0].value is not None
+    assert hexint(function_call.params_in.params[0].value) == 0
+
+    assert function_call.params_out is not None
+    assert function_call.params_out.params[0].value is not None
+    assert hexint(function_call.params_out.params[0].value) == 2863311530
+
+
 def test_vmray_model_analysis_metadata():
     analysis_metadata: AnalysisMetadata = AnalysisMetadata.model_validate_json(
         """
@@ -88,8 +103,6 @@ def test_vmray_model_elffile():
         """
     )
 
-    assert elffile.sections is not None
-    assert elffile.sections[0].header is not None
     assert elffile.sections[0].header.sh_name == "abcd1234"
     assert elffile.sections[0].header.sh_addr == 2863311530
 

From 6ef485f67b5228a7c8a45ed16598ac7bb4dc2f09 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 15:44:53 -0600
Subject: [PATCH 079/105] vmray: refactor model tests

---
 tests/test_vmray_model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_vmray_model.py b/tests/test_vmray_model.py
index 79e234970..f19ae87b3 100644
--- a/tests/test_vmray_model.py
+++ b/tests/test_vmray_model.py
@@ -147,8 +147,10 @@ def test_vmray_model_pefile():
     )
 
     assert pefile.basic_info.image_base == 2863311530
+
     assert pefile.imports[0].dll == "KERNEL32.dll"
     assert pefile.imports[0].apis[0].address == 2863311530
     assert pefile.imports[0].apis[0].api.name == "Sleep"
+
     assert pefile.sections[0].name == ".text"
     assert pefile.sections[0].virtual_address == 2863311530

From 3b9496113332b4cc9c909a34ecc5a5dc9e1d7efb Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Fri, 19 Jul 2024 15:50:07 -0600
Subject: [PATCH 080/105] vmray: complete pefile model tests

---
 tests/test_vmray_model.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/test_vmray_model.py b/tests/test_vmray_model.py
index f19ae87b3..c693b6631 100644
--- a/tests/test_vmray_model.py
+++ b/tests/test_vmray_model.py
@@ -136,7 +136,7 @@ def test_vmray_model_pefile():
             "exports": [
                 {
                     "api": {
-                        "name": "HellWorld",
+                        "name": "HelloWorld",
                         "ordinal": 10
                     },
                     "address": 2863311530
@@ -154,3 +154,7 @@ def test_vmray_model_pefile():
 
     assert pefile.sections[0].name == ".text"
     assert pefile.sections[0].virtual_address == 2863311530
+
+    assert pefile.exports[0].address == 2863311530
+    assert pefile.exports[0].api.name == "HelloWorld"
+    assert pefile.exports[0].api.ordinal == 10

From 46b68d11b7495a941cf5e635ec5f653983694490 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 23 Jul 2024 09:48:52 -0600
Subject: [PATCH 081/105] vmray: improve models.py comments

---
 capa/features/extractors/vmray/models.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 63ef3b273..8049d214c 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -106,13 +106,15 @@ class Params(BaseModel):
 
 def validate_call_name(value: str) -> str:
     if value.startswith("sys_"):
+        # VMRay appears to log kernel function calls ("sys_*") so we remove that
+        # here to enable capa matching
         return value[4:]
     else:
         return value
 
 
-# call names may contain uneeded data so we remove that data before
-# the inner validation (str) is called
+# function call names may need to be reformatted to remove data, etc. so we reformat
+# before calling the inner validation (str)
 CallName = Annotated[str, BeforeValidator(validate_call_name)]
 
 

From 31e53fab2048bfbb158d9805fd5eaca075e4d4a8 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 23 Jul 2024 09:52:36 -0600
Subject: [PATCH 082/105] vmray: improve models.py comments

---
 capa/features/extractors/vmray/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 8049d214c..0f7e080b1 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -106,7 +106,7 @@ class Params(BaseModel):
 
 def validate_call_name(value: str) -> str:
     if value.startswith("sys_"):
-        # VMRay appears to log kernel function calls ("sys_*") so we remove that
+        # VMRay appears to log kernel function calls ("sys_*") for Linux so we remove that
         # here to enable capa matching
         return value[4:]
     else:

From f6d12bcb414f6eb26200670e14fa72437143abe8 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 24 Jul 2024 10:03:57 -0600
Subject: [PATCH 083/105] vmray: fix lints

---
 capa/helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/helpers.py b/capa/helpers.py
index fa8239c6e..3a8fee5e0 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -11,7 +11,7 @@
 import logging
 import contextlib
 import importlib.util
-from typing import Dict, Union, BinaryIO, Iterator, List, NoReturn
+from typing import Dict, List, Union, BinaryIO, Iterator, NoReturn
 from pathlib import Path
 from zipfile import ZipFile
 

From 85373a7ddb89b6ab5f973f774b7c6ec8e909e5b4 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 24 Jul 2024 10:09:22 -0600
Subject: [PATCH 084/105] cape: add explicit check for CAPE report format file
 extension

---
 capa/helpers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/capa/helpers.py b/capa/helpers.py
index 3a8fee5e0..9de2cbd96 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -132,7 +132,7 @@ def get_format_from_report(sample: Path) -> str:
             if "logs/summary_v2.json" in namelist and "logs/flog.xml" in namelist:
                 # assume VMRay zipfile at a minimum has these files
                 return FORMAT_VMRAY
-    else:
+    elif sample.name.endswith(("json", "json_", "json.gz")):
         report = load_json_from_path(sample)
         if "CAPE" in report:
             return FORMAT_CAPE
@@ -140,6 +140,7 @@ def get_format_from_report(sample: Path) -> str:
             # CAPE report that's missing the "CAPE" key,
             # which is not going to be much use, but its correct.
             return FORMAT_CAPE
+            
     return FORMAT_UNKNOWN
 
 

From 6e146bb1265e7b26227e90b2f5a0d0e3391ee2bf Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 24 Jul 2024 10:12:21 -0600
Subject: [PATCH 085/105] vmray: fix lints

---
 capa/helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/helpers.py b/capa/helpers.py
index 9de2cbd96..ac7b52f62 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -140,7 +140,7 @@ def get_format_from_report(sample: Path) -> str:
             # CAPE report that's missing the "CAPE" key,
             # which is not going to be much use, but its correct.
             return FORMAT_CAPE
-            
+
     return FORMAT_UNKNOWN
 
 

From 9a1364c21ca797d415abe70f9a36b6605bbd9f35 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 24 Jul 2024 10:32:22 -0600
Subject: [PATCH 086/105] vmray: document vmray support in README

---
 README.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 882b5cb3d..391f51b41 100644
--- a/README.md
+++ b/README.md
@@ -126,12 +126,14 @@ function @ 0x4011C0
 ...
 ```
 
-Additionally, capa also supports analyzing sandbox reports for dynamic capability extraction.
-In order to use this, you first submit your sample to one of supported sandboxes for analysis, and then run capa against the generated report file.
+capa also supports dynamic capabilities detection for multiple sandboxes including:
+* [CAPE sandbox](https://github.com/kevoreilly/CAPEv2): `.json`, `.json_`, `.json.gz`
+* [DRAKVUF sandbox](https://github.com/CERT-Polska/drakvuf-sandbox/): `.log`, `.log.gz`
+* [VMRay sandbox](https://www.vmray.com/): `.zip`
 
-Currently, capa supports the [CAPE sandbox](https://github.com/kevoreilly/CAPEv2) and the [DRAKVUF sandbox](https://github.com/CERT-Polska/drakvuf-sandbox/). In order to use either, simply run capa against the generated file (JSON for CAPE or LOG for DRAKVUF sandbox) and it will automatically detect the sandbox and extract capabilities from it.
+To use this feature, submit your file to a supported sandbox and then download and run capa against the generated report file. This feature enables capa to match capabilities against dynamic and static features that the sandbox captured during execution.
 
-Here's an example of running capa against a packed binary, and then running capa against the CAPE report of that binary:
+Here's an example of running capa against a packed file, and then running capa against the CAPE report generated for the same packed file:
 
 ```yaml
 $ capa 05be49819139a3fdcdbddbdefd298398779521f3d68daa25275cc77508e42310.exe

From b8d3d77829f576819b0c325ea15652804e19a9ce Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Wed, 24 Jul 2024 10:35:34 -0600
Subject: [PATCH 087/105] vmray: document vmray support in README

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 391f51b41..f3364c690 100644
--- a/README.md
+++ b/README.md
@@ -127,9 +127,9 @@ function @ 0x4011C0
 ```
 
 capa also supports dynamic capabilities detection for multiple sandboxes including:
-* [CAPE sandbox](https://github.com/kevoreilly/CAPEv2): `.json`, `.json_`, `.json.gz`
-* [DRAKVUF sandbox](https://github.com/CERT-Polska/drakvuf-sandbox/): `.log`, `.log.gz`
-* [VMRay sandbox](https://www.vmray.com/): `.zip`
+* [CAPE](https://github.com/kevoreilly/CAPEv2) (supported report formats: `.json`, `.json_`, `.json.gz`)
+* [DRAKVUF](https://github.com/CERT-Polska/drakvuf-sandbox/) (supported report formats: `.log`, `.log.gz`)
+* [VMRay](https://www.vmray.com/) (supported report formats: analysis archive `.zip`)
 
 To use this feature, submit your file to a supported sandbox and then download and run capa against the generated report file. This feature enables capa to match capabilities against dynamic and static features that the sandbox captured during execution.
 

From 5b7a2be652320cabc947cb2b3f65e15f735ef806 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 25 Jul 2024 09:33:17 -0600
Subject: [PATCH 088/105] vmray: remove outdated comments from __init__.py

---
 capa/features/extractors/vmray/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 5c27f83ad..34c5ab2d4 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -16,7 +16,6 @@
 
 logger = logging.getLogger(__name__)
 
-# TODO (meh): is default password "infected" good enough?? https://github.com/mandiant/capa/issues/2148
 DEFAULT_ARCHIVE_PASSWORD = b"infected"
 
 SUPPORTED_FLOG_VERSIONS = ("2",)

From 7b3812ae190c9a90439cf0973ab58d145f1e1997 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 25 Jul 2024 12:12:49 -0600
Subject: [PATCH 089/105] vmray: improve error reporting

---
 capa/features/extractors/vmray/__init__.py | 18 +++++-------------
 capa/helpers.py                            |  9 +++++++++
 capa/main.py                               |  7 +++++++
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 34c5ab2d4..a1f90be19 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -38,9 +38,8 @@ def __init__(self, zipfile_path: Path):
         self.flog = Flog.model_validate(flog_dict)
 
         if self.flog.analysis.log_version not in SUPPORTED_FLOG_VERSIONS:
-            logger.warning("VMRay feature extractor does not support flog version %s", self.flog.analysis.log_version)
             raise UnsupportedFormatError(
-                "VMRay feature extractor does not support flog version %s", self.flog.analysis.log_version
+                "VMRay feature extractor does not support flog version %s" % self.flog.analysis.log_version
             )
 
         self.exports: Dict[int, str] = {}
@@ -58,20 +57,14 @@ def __init__(self, zipfile_path: Path):
         self._find_sample_file()
 
         if self.sample_file_name is None or self.sample_file_analysis is None:
-            logger.warning("VMRay archive does not contain sample file (file_type: %s)", self.file_type)
-            raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)", self.file_type)
+            raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)" % self.file_type)
 
         if not self.sample_file_static_data:
-            # we see this for text files e.g. JScript file types
-            logger.warning("VMRay archive does not contain static data (file_type: %s)", self.file_type)
-            raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)", self.file_type)
+            raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)" % self.file_type)
 
         if not self.sample_file_static_data.pe and not self.sample_file_static_data.elf:
-            logger.warning(
-                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)", self.file_type
-            )
             raise UnsupportedFormatError(
-                "VMRay feature extractor only supports PE and ELF at this time(file_type: %s)", self.file_type
+                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)" % self.file_type
             )
 
         # VMRay does not store static strings for the sample file so we must use the source file
@@ -79,11 +72,10 @@ def __init__(self, zipfile_path: Path):
         sample_sha256: str = self.sample_file_analysis.hash_values.sha256.lower()
         sample_file_path: str = f"internal/static_analyses/{sample_sha256}/objects/files/{sample_sha256}"
 
-        logger.debug("file_type: %s, file_path: %s", self.file_type, sample_file_path)
+        logger.debug("file_type: %s, file_path: %s" % self.file_type, sample_file_path)
 
         self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)
 
-        # only compute these if we've found a supported sample file type
         self._compute_base_address()
         self._compute_imports()
         self._compute_exports()
diff --git a/capa/helpers.py b/capa/helpers.py
index ac7b52f62..ebfde6e37 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -249,6 +249,15 @@ def log_unsupported_drakvuf_report_error(error: str):
     logger.error("-" * 80)
 
 
+def log_unsupported_vmray_report_error(error: str):
+    logger.error("-" * 80)
+    logger.error(" Input file is not a valid VMRay analysis archive: %s", error)
+    logger.error(" ")
+    logger.error(" capa only supports analyzing VMRay dynamic analysis archives.")
+    logger.error(" Please make sure you have downloaded a dynamic analysis archive from VMRay.")
+    logger.error("-" * 80)
+
+
 def log_empty_sandbox_report_error(error: str, sandbox_name: str):
     logger.error("-" * 80)
     logger.error(" %s report is empty or only contains little useful data: %s", sandbox_name, error)
diff --git a/capa/main.py b/capa/main.py
index d6eab117c..076c68fc2 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -60,6 +60,7 @@
     log_unsupported_format_error,
     log_empty_sandbox_report_error,
     log_unsupported_cape_report_error,
+    log_unsupported_vmray_report_error,
     log_unsupported_drakvuf_report_error,
 )
 from capa.exceptions import (
@@ -656,6 +657,9 @@ def get_file_extractors_from_cli(args, input_format: str) -> List[FeatureExtract
             log_unsupported_cape_report_error(str(e))
         elif input_format == FORMAT_DRAKVUF:
             log_unsupported_drakvuf_report_error(str(e))
+        elif input_format == FORMAT_VMRAY:
+            log_unsupported_vmray_report_error(str(e))
+            print(e)
         else:
             log_unsupported_format_error()
         raise ShouldExitError(E_INVALID_FILE_TYPE) from e
@@ -773,6 +777,9 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr
             log_unsupported_cape_report_error(str(e))
         elif input_format == FORMAT_DRAKVUF:
             log_unsupported_drakvuf_report_error(str(e))
+        elif input_format == FORMAT_VMRAY:
+            log_unsupported_vmray_report_error(str(e))
+            print(e)
         else:
             log_unsupported_format_error()
         raise ShouldExitError(E_INVALID_FILE_TYPE) from e

From 05fb8f658f8854951d71f91fe47130942d8aff11 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 25 Jul 2024 12:19:22 -0600
Subject: [PATCH 090/105] vmray: fix flake8 lints

---
 capa/features/extractors/vmray/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index a1f90be19..d59c3652e 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -39,7 +39,7 @@ def __init__(self, zipfile_path: Path):
 
         if self.flog.analysis.log_version not in SUPPORTED_FLOG_VERSIONS:
             raise UnsupportedFormatError(
-                "VMRay feature extractor does not support flog version %s" % self.flog.analysis.log_version
+                "VMRay feature extractor does not support flog version %s", self.flog.analysis.log_version
             )
 
         self.exports: Dict[int, str] = {}
@@ -57,14 +57,14 @@ def __init__(self, zipfile_path: Path):
         self._find_sample_file()
 
         if self.sample_file_name is None or self.sample_file_analysis is None:
-            raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)" % self.file_type)
+            raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)", self.file_type)
 
         if not self.sample_file_static_data:
-            raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)" % self.file_type)
+            raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)", self.file_type)
 
         if not self.sample_file_static_data.pe and not self.sample_file_static_data.elf:
             raise UnsupportedFormatError(
-                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)" % self.file_type
+                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)", self.file_type
             )
 
         # VMRay does not store static strings for the sample file so we must use the source file
@@ -72,7 +72,7 @@ def __init__(self, zipfile_path: Path):
         sample_sha256: str = self.sample_file_analysis.hash_values.sha256.lower()
         sample_file_path: str = f"internal/static_analyses/{sample_sha256}/objects/files/{sample_sha256}"
 
-        logger.debug("file_type: %s, file_path: %s" % self.file_type, sample_file_path)
+        logger.debug("file_type: %s, file_path: %s", self.file_type, sample_file_path)
 
         self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)
 

From b967213302a1225c0fbd870d932cc5438ca6b572 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 25 Jul 2024 12:30:20 -0600
Subject: [PATCH 091/105] vmray: improve comments in __init__.py

---
 capa/features/extractors/vmray/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index d59c3652e..95f1f3e09 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -92,6 +92,8 @@ def _find_sample_file(self):
                 self.sample_file_analysis = file_analysis
 
                 if file_analysis.ref_static_data:
+                    # like "path": ["static_data","static_data_0"] where "static_data_0" is the summary_v2 static data
+                    # key for the file's static data
                     self.sample_file_static_data = self.sv2.static_data[file_analysis.ref_static_data.path[1]]
 
                 break

From 51b853de597c6743a2cf655da73806e77ebe5864 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 29 Jul 2024 11:39:03 -0600
Subject: [PATCH 092/105] vmray: remove bad print statements

---
 capa/main.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/capa/main.py b/capa/main.py
index 076c68fc2..e81bf7086 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -659,7 +659,6 @@ def get_file_extractors_from_cli(args, input_format: str) -> List[FeatureExtract
             log_unsupported_drakvuf_report_error(str(e))
         elif input_format == FORMAT_VMRAY:
             log_unsupported_vmray_report_error(str(e))
-            print(e)
         else:
             log_unsupported_format_error()
         raise ShouldExitError(E_INVALID_FILE_TYPE) from e
@@ -779,7 +778,6 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr
             log_unsupported_drakvuf_report_error(str(e))
         elif input_format == FORMAT_VMRAY:
             log_unsupported_vmray_report_error(str(e))
-            print(e)
         else:
             log_unsupported_format_error()
         raise ShouldExitError(E_INVALID_FILE_TYPE) from e

From 1a3cf4aa8eda7f45b0cc52145869a74499df5697 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 29 Jul 2024 11:41:31 -0600
Subject: [PATCH 093/105] vmray: update extractor.py format_params

---
 capa/features/extractors/vmray/extractor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py
index a8f3a79a4..735c646b9 100644
--- a/capa/features/extractors/vmray/extractor.py
+++ b/capa/features/extractors/vmray/extractor.py
@@ -27,7 +27,7 @@
 )
 
 
-def format_params(params: ParamList) -> List[str]:
+def get_formatted_params(params: ParamList) -> List[str]:
     params_list: List[str] = []
 
     for param in params:
@@ -107,13 +107,13 @@ def get_call_name(self, ph, th, ch) -> str:
 
         # format input parameters
         if call.params_in:
-            call_formatted += f"({', '.join(format_params(call.params_in.params))})"
+            call_formatted += f"({', '.join(get_formatted_params(call.params_in.params))})"
         else:
             call_formatted += "()"
 
         # format output parameters
         if call.params_out:
-            call_formatted += f" -> {', '.join(format_params(call.params_out.params))}"
+            call_formatted += f" -> {', '.join(get_formatted_params(call.params_out.params))}"
 
         return call_formatted
 

From 8cba23bbce87cdbec6f11cbed1e1b747048bada9 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 29 Jul 2024 11:49:04 -0600
Subject: [PATCH 094/105] vmray: improve extract_import_names

---
 capa/features/extractors/vmray/file.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py
index c84f0190a..38ac9db01 100644
--- a/capa/features/extractors/vmray/file.py
+++ b/capa/features/extractors/vmray/file.py
@@ -43,8 +43,8 @@ def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Add
 
 
 def extract_import_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
-    for addr, name in analysis.imports.items():
-        for symbol in generate_symbols(name[0], name[1], include_dll=True):
+    for addr, (module, api) in analysis.imports.items():
+        for symbol in generate_symbols(module, api, include_dll=True):
             yield Import(symbol), AbsoluteVirtualAddress(addr)
 
 

From 87dfa50996b00ef52a362516e3383b9ffe8f79d2 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 29 Jul 2024 12:00:29 -0600
Subject: [PATCH 095/105] scripts: remove old code from show-features.py

---
 scripts/show-features.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/scripts/show-features.py b/scripts/show-features.py
index f624eac4f..6005a810c 100644
--- a/scripts/show-features.py
+++ b/scripts/show-features.py
@@ -258,9 +258,6 @@ def print_dynamic_features(processes, extractor: DynamicFeatureExtractor):
                     if isinstance(feature, (Number, String)):
                         arguments.append(str(feature.value))
 
-                # if not apis:
-                #    print(f"    arguments=[{', '.join(arguments)}]")
-
                 for cid, api in apis:
                     print(f"    call {cid}: {api}({', '.join(arguments)})")
 

From 7bf0b396eeb3a60c24840fda3720e39e2884ed91 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 29 Jul 2024 12:02:14 -0600
Subject: [PATCH 096/105] core: improve error message for vmray

---
 capa/helpers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/capa/helpers.py b/capa/helpers.py
index ebfde6e37..c0e5c5dcf 100644
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -253,7 +253,9 @@ def log_unsupported_vmray_report_error(error: str):
     logger.error("-" * 80)
     logger.error(" Input file is not a valid VMRay analysis archive: %s", error)
     logger.error(" ")
-    logger.error(" capa only supports analyzing VMRay dynamic analysis archives.")
+    logger.error(
+        " capa only supports analyzing VMRay dynamic analysis archives containing summary_v2.json and flog.xml log files."
+    )
     logger.error(" Please make sure you have downloaded a dynamic analysis archive from VMRay.")
     logger.error("-" * 80)
 

From 139dcc430c9c947bca118d87c740fca95a7ccc95 Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 29 Jul 2024 12:16:05 -0600
Subject: [PATCH 097/105] vmray: improve logging

---
 capa/features/extractors/vmray/__init__.py | 8 ++++----
 capa/features/extractors/vmray/global_.py  | 9 +++------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 95f1f3e09..88d2a8e28 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -39,7 +39,7 @@ def __init__(self, zipfile_path: Path):
 
         if self.flog.analysis.log_version not in SUPPORTED_FLOG_VERSIONS:
             raise UnsupportedFormatError(
-                "VMRay feature extractor does not support flog version %s", self.flog.analysis.log_version
+                "VMRay feature extractor does not support flog version %s" % self.flog.analysis.log_version
             )
 
         self.exports: Dict[int, str] = {}
@@ -57,14 +57,14 @@ def __init__(self, zipfile_path: Path):
         self._find_sample_file()
 
         if self.sample_file_name is None or self.sample_file_analysis is None:
-            raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)", self.file_type)
+            raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)" % self.file_type)
 
         if not self.sample_file_static_data:
-            raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)", self.file_type)
+            raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)" % self.file_type)
 
         if not self.sample_file_static_data.pe and not self.sample_file_static_data.elf:
             raise UnsupportedFormatError(
-                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)", self.file_type
+                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)" % self.file_type
             )
 
         # VMRay does not store static strings for the sample file so we must use the source file
diff --git a/capa/features/extractors/vmray/global_.py b/capa/features/extractors/vmray/global_.py
index 95f7cc90d..a42ce511e 100644
--- a/capa/features/extractors/vmray/global_.py
+++ b/capa/features/extractors/vmray/global_.py
@@ -35,8 +35,7 @@ def extract_arch(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
     elif "x86-64" in file_type:
         yield Arch(ARCH_AMD64), NO_ADDRESS
     else:
-        logger.warning("unrecognized arch: %s", file_type)
-        raise ValueError(f"unrecognized arch from the VMRay report: {file_type}")
+        raise ValueError("unrecognized arch from the VMRay report: %s" % file_type)
 
 
 def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
@@ -46,8 +45,7 @@ def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]
     elif analysis.sample_file_static_data.elf:
         yield Format(FORMAT_ELF), NO_ADDRESS
     else:
-        logger.warning("unrecognized file format: %s", analysis.file_type)
-        raise ValueError(f"unrecognized file format from the VMRay report: {analysis.file_type}")
+        raise ValueError("unrecognized file format from the VMRay report: %s" % analysis.file_type)
 
 
 def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
@@ -58,8 +56,7 @@ def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
     elif "linux" in file_type.lower():
         yield OS(OS_LINUX), NO_ADDRESS
     else:
-        logger.warning("unrecognized OS: %s", file_type)
-        raise ValueError(f"unrecognized OS from the VMRay report: {file_type}")
+        raise ValueError("unrecognized OS from the VMRay report: %s" % file_type)
 
 
 def extract_features(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:

From 71c515d4d789ac5c521c1cc32b96d5f14025e59c Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Mon, 29 Jul 2024 12:19:53 -0600
Subject: [PATCH 098/105] vmray: improve comments __init__.py

---
 capa/features/extractors/vmray/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 88d2a8e28..06d581cc9 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -56,6 +56,9 @@ def __init__(self, zipfile_path: Path):
 
         self._find_sample_file()
 
+        # VMRay analysis archives come in various shapes and sizes, and the file type does not definitively tell us
+        # what data we can expect to find in the archive, so to be explicit we check for the various pieces that we
+        # need at minimum to run capa analysis
         if self.sample_file_name is None or self.sample_file_analysis is None:
             raise UnsupportedFormatError("VMRay archive does not contain sample file (file_type: %s)" % self.file_type)
 

From a8d849e8722fb06470e4a82a6c9677cec811a75b Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Tue, 30 Jul 2024 11:43:53 -0600
Subject: [PATCH 099/105] vmray: improve comments models.py

---
 capa/features/extractors/vmray/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 0f7e080b1..a599dc420 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -74,7 +74,7 @@ def validate_hex_int(value: Union[str, int]) -> int:
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
 
-# models flog.xml file
+# models for flog.xml file, certain fields left as comments for documentation purposes
 class ParamDeref(BaseModel):
     type_: str = Field(alias="type")
     value: Optional[str] = None
@@ -150,7 +150,7 @@ class Flog(BaseModel):
     analysis: Analysis
 
 
-# models for summary_v2.json file
+# models for summary_v2.json file, certain fields left as comments for documentation purposes
 class GenericReference(BaseModel):
     path: List[str]
     source: str

From 3982356945ad41a791c6218966083bad4bdd847b Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 31 Jul 2024 12:59:16 +0000
Subject: [PATCH 100/105] load gzipped rd, see capa-testfiles#245

---
 capa/render/result_document.py | 5 +++--
 tests/fixtures.py              | 5 ++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/capa/render/result_document.py b/capa/render/result_document.py
index 975e37431..ab6b03979 100644
--- a/capa/render/result_document.py
+++ b/capa/render/result_document.py
@@ -22,7 +22,7 @@
 import capa.features.freeze.features as frzf
 from capa.rules import RuleSet
 from capa.engine import MatchResults
-from capa.helpers import assert_never
+from capa.helpers import assert_never, load_json_from_path
 
 
 class FrozenModel(BaseModel):
@@ -668,4 +668,5 @@ def to_capa(self) -> Tuple[Metadata, Dict]:
 
     @classmethod
     def from_file(cls, path: Path) -> "ResultDocument":
-        return cls.model_validate_json(path.read_text(encoding="utf-8"))
+        report = load_json_from_path(path)
+        return cls.model_validate(report)
diff --git a/tests/fixtures.py b/tests/fixtures.py
index 286eaaef8..3ae05f503 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -1537,4 +1537,7 @@ def a076114_rd():
 @pytest.fixture
 def dynamic_a0000a6_rd():
     # python -m capa.main tests/data/dynamic/cape/v2.2/0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json --json > tests/data/rd/0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json
-    return get_result_doc(CD / "data" / "rd" / "0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json")
+    # gzip tests/data/rd/0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json
+    return get_result_doc(
+        CD / "data" / "rd" / "0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82.json.gz"
+    )

From e83f289c8eac785669ed7b63dfe6b4cc05b5de5e Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 31 Jul 2024 13:28:41 +0000
Subject: [PATCH 101/105] add script to minimize vmray archive to only relevant
 files

---
 scripts/minimize_vmray_results.py | 63 +++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 scripts/minimize_vmray_results.py

diff --git a/scripts/minimize_vmray_results.py b/scripts/minimize_vmray_results.py
new file mode 100644
index 000000000..e36688bbb
--- /dev/null
+++ b/scripts/minimize_vmray_results.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+"""
+Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+You may obtain a copy of the License at: [package root]/LICENSE.txt
+Unless required by applicable law or agreed to in writing, software distributed under the License
+ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and limitations under the License.
+
+Extract files relevant to capa analysis from VMRay Analysis Archive and create a new ZIP file.
+"""
+import sys
+import logging
+import zipfile
+import argparse
+from pathlib import Path
+
+from capa.features.extractors.vmray import DEFAULT_ARCHIVE_PASSWORD, VMRayAnalysis
+
+logger = logging.getLogger(__name__)
+
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv[1:]
+
+    parser = argparse.ArgumentParser(
+        description="Minimize VMRay Analysis Archive to ZIP file only containing relevant files"
+    )
+    parser.add_argument(
+        "analysis_archive",
+        type=Path,
+        help="path to VMRay Analysis Archive downloaded from Dynamic Analysis Report page",
+    )
+    parser.add_argument(
+        "-p", "--password", type=str, default="infected", help="password used to unzip and zip protected archives"
+    )
+    args = parser.parse_args(args=argv)
+
+    analysis_archive = args.analysis_archive
+
+    vmra = VMRayAnalysis(analysis_archive)
+    sv2_json = vmra.zipfile.read("logs/summary_v2.json", pwd=DEFAULT_ARCHIVE_PASSWORD)
+    flog_xml = vmra.zipfile.read("logs/flog.xml", pwd=DEFAULT_ARCHIVE_PASSWORD)
+    sample_file_buf = vmra.sample_file_buf
+    sample_sha256: str = vmra.sample_file_analysis.hash_values.sha256.lower()
+
+    new_zip_name = f"{analysis_archive.parent / analysis_archive.stem}_min.zip"
+    with zipfile.ZipFile(new_zip_name, "w") as new_zip:
+        new_zip.writestr("logs/summary_v2.json", sv2_json)
+        new_zip.writestr("logs/flog.xml", flog_xml)
+        new_zip.writestr(f"internal/static_analyses/{sample_sha256}/objects/files/{sample_sha256}", sample_file_buf)
+        new_zip.setpassword(args.password.encode("ascii"))
+
+    # ensure capa loads the minimized archive
+    assert isinstance(VMRayAnalysis(Path(new_zip_name)), VMRayAnalysis)
+
+    print(f"Created minimized VMRay archive '{new_zip_name}' with password '{args.password}'.")
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From e47635455ef2864b3455e18dd0239f0a83e75045 Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Wed, 31 Jul 2024 13:30:30 +0000
Subject: [PATCH 102/105] add dynamic vmray feature tests

---
 tests/fixtures.py              | 17 ++++++-
 tests/test_drakvuf_features.py | 45 +++++++++--------
 tests/test_vmray_features.py   | 89 ++++++++++++++++++++++++++++++++++
 3 files changed, 130 insertions(+), 21 deletions(-)
 create mode 100644 tests/test_vmray_features.py

diff --git a/tests/fixtures.py b/tests/fixtures.py
index 3ae05f503..ddd6e87ed 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -209,6 +209,13 @@ def get_drakvuf_extractor(path):
     return DrakvufExtractor.from_report(report)
 
 
+@lru_cache(maxsize=1)
+def get_vmray_extractor(path):
+    from capa.features.extractors.vmray.extractor import VMRayExtractor
+
+    return VMRayExtractor.from_zipfile(path)
+
+
 @lru_cache(maxsize=1)
 def get_ghidra_extractor(path: Path):
     import capa.features.extractors.ghidra.extractor
@@ -395,7 +402,7 @@ def get_data_path_by_name(name) -> Path:
             / "v2.2"
             / "d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7.json.gz"
         )
-    elif name.startswith("93b2d1"):
+    elif name.startswith("93b2d1-drakvuf"):
         return (
             CD
             / "data"
@@ -403,6 +410,14 @@ def get_data_path_by_name(name) -> Path:
             / "drakvuf"
             / "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795.log.gz"
         )
+    elif name.startswith("93b2d1-vmray"):
+        return (
+            CD
+            / "data"
+            / "dynamic"
+            / "vmray"
+            / "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795_archive.zip"
+        )
     elif name.startswith("ea2876"):
         return CD / "data" / "ea2876e9175410b6f6719f80ee44b9553960758c7d0f7bed73c0fe9a78d8e669.dll_"
     elif name.startswith("1038a2"):
diff --git a/tests/test_drakvuf_features.py b/tests/test_drakvuf_features.py
index 79832fc34..61fe69442 100644
--- a/tests/test_drakvuf_features.py
+++ b/tests/test_drakvuf_features.py
@@ -15,26 +15,31 @@
 
 DYNAMIC_DRAKVUF_FEATURE_PRESENCE_TESTS = sorted(
     [
-        ("93b2d1", "file", capa.features.common.String("\\Program Files\\WindowsApps\\does_not_exist"), False),
+        ("93b2d1-drakvuf", "file", capa.features.common.String("\\Program Files\\WindowsApps\\does_not_exist"), False),
         # file/imports
-        ("93b2d1", "file", capa.features.file.Import("SetUnhandledExceptionFilter"), True),
+        ("93b2d1-drakvuf", "file", capa.features.file.Import("SetUnhandledExceptionFilter"), True),
         # thread/api calls
-        ("93b2d1", "process=(3564:4852),thread=6592", capa.features.insn.API("LdrLoadDll"), True),
-        ("93b2d1", "process=(3564:4852),thread=6592", capa.features.insn.API("DoesNotExist"), False),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592", capa.features.insn.API("LdrLoadDll"), True),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592", capa.features.insn.API("DoesNotExist"), False),
         # call/api
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.insn.API("LdrLoadDll"), True),
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.insn.API("DoesNotExist"), False),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.insn.API("LdrLoadDll"), True),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.insn.API("DoesNotExist"), False),
         # call/string argument
         (
-            "93b2d1",
+            "93b2d1-drakvuf",
             "process=(3564:4852),thread=6592,call=1",
             capa.features.common.String('0x667e2beb40:"api-ms-win-core-fibers-l1-1-1"'),
             True,
         ),
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.common.String("non_existant"), False),
+        (
+            "93b2d1-drakvuf",
+            "process=(3564:4852),thread=6592,call=1",
+            capa.features.common.String("non_existant"),
+            False,
+        ),
         # call/number argument
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.insn.Number(0x801), True),
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.insn.Number(0x010101010101), False),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.insn.Number(0x801), True),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.insn.Number(0x010101010101), False),
     ],
     # order tests by (file, item)
     # so that our LRU cache is most effective.
@@ -43,26 +48,26 @@
 
 DYNAMIC_DRAKVUF_FEATURE_COUNT_TESTS = sorted(
     [
-        ("93b2d1", "file", capa.features.common.String("\\Program Files\\WindowsApps\\does_not_exist"), False),
+        ("93b2d1-drakvuf", "file", capa.features.common.String("\\Program Files\\WindowsApps\\does_not_exist"), False),
         # file/imports
-        ("93b2d1", "file", capa.features.file.Import("SetUnhandledExceptionFilter"), 1),
+        ("93b2d1-drakvuf", "file", capa.features.file.Import("SetUnhandledExceptionFilter"), 1),
         # thread/api calls
-        ("93b2d1", "process=(3564:4852),thread=6592", capa.features.insn.API("LdrLoadDll"), 9),
-        ("93b2d1", "process=(3564:4852),thread=6592", capa.features.insn.API("DoesNotExist"), False),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592", capa.features.insn.API("LdrLoadDll"), 9),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592", capa.features.insn.API("DoesNotExist"), False),
         # call/api
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.insn.API("LdrLoadDll"), 1),
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.insn.API("DoesNotExist"), 0),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.insn.API("LdrLoadDll"), 1),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.insn.API("DoesNotExist"), 0),
         # call/string argument
         (
-            "93b2d1",
+            "93b2d1-drakvuf",
             "process=(3564:4852),thread=6592,call=1",
             capa.features.common.String('0x667e2beb40:"api-ms-win-core-fibers-l1-1-1"'),
             1,
         ),
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.common.String("non_existant"), 0),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.common.String("non_existant"), 0),
         # call/number argument
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.insn.Number(0x801), 1),
-        ("93b2d1", "process=(3564:4852),thread=6592,call=1", capa.features.insn.Number(0x010101010101), 0),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.insn.Number(0x801), 1),
+        ("93b2d1-drakvuf", "process=(3564:4852),thread=6592,call=1", capa.features.insn.Number(0x010101010101), 0),
     ],
     # order tests by (file, item)
     # so that our LRU cache is most effective.
diff --git a/tests/test_vmray_features.py b/tests/test_vmray_features.py
new file mode 100644
index 000000000..d92a75e49
--- /dev/null
+++ b/tests/test_vmray_features.py
@@ -0,0 +1,89 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import fixtures
+
+import capa.main
+import capa.features.file
+import capa.features.insn
+import capa.features.common
+
+DYNAMIC_VMRAY_FEATURE_PRESENCE_TESTS = sorted(
+    [
+        ("93b2d1-vmray", "file", capa.features.common.String("api.%x%x.%s"), True),
+        ("93b2d1-vmray", "file", capa.features.common.String("\\Program Files\\WindowsApps\\does_not_exist"), False),
+        # file/imports
+        ("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), True),
+        # thread/api calls
+        ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), True),
+        ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("DoesNotExist"), False),
+        # call/api
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), True),
+        # call/string argument
+        (
+            "93b2d1-vmray",
+            "process=(2176:0),thread=7,call=10323",
+            capa.features.common.String("raw.githubusercontent.com"),
+            True,
+        ),
+        # call/number argument
+        # VirtualAlloc(4096, 4)
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4096), True),
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4), True),
+    ],
+    # order tests by (file, item)
+    # so that our LRU cache is most effective.
+    key=lambda t: (t[0], t[1]),
+)
+
+DYNAMIC_VMRAY_FEATURE_COUNT_TESTS = sorted(
+    [
+        # file/imports
+        ("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), 1),
+        # thread/api calls
+        ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("free"), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), 5),
+        # call/api
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("free"), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("GetAddrInfoW"), 0),
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), 1),
+        # call/string argument
+        (
+            "93b2d1-vmray",
+            "process=(2176:0),thread=7,call=10323",
+            capa.features.common.String("raw.githubusercontent.com"),
+            1,
+        ),
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=10323", capa.features.common.String("non_existant"), 0),
+        # call/number argument
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4096), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4), 1),
+        ("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(404), 0),
+    ],
+    # order tests by (file, item)
+    # so that our LRU cache is most effective.
+    key=lambda t: (t[0], t[1]),
+)
+
+
+@fixtures.parametrize(
+    "sample,scope,feature,expected",
+    DYNAMIC_VMRAY_FEATURE_PRESENCE_TESTS,
+    indirect=["sample", "scope"],
+)
+def test_vmray_features(sample, scope, feature, expected):
+    fixtures.do_test_feature_presence(fixtures.get_vmray_extractor, sample, scope, feature, expected)
+
+
+@fixtures.parametrize(
+    "sample,scope,feature,expected",
+    DYNAMIC_VMRAY_FEATURE_COUNT_TESTS,
+    indirect=["sample", "scope"],
+)
+def test_vmray_feature_counts(sample, scope, feature, expected):
+    fixtures.do_test_feature_count(fixtures.get_vmray_extractor, sample, scope, feature, expected)

From afb72867f4d46bcd89fd3d3c1a22966fc081bb5f Mon Sep 17 00:00:00 2001
From: Moritz <mr-tz@users.noreply.github.com>
Date: Thu, 1 Aug 2024 07:58:29 +0200
Subject: [PATCH 103/105] assert sample analysis data is present

---
 scripts/minimize_vmray_results.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/minimize_vmray_results.py b/scripts/minimize_vmray_results.py
index e36688bbb..15ab81c26 100644
--- a/scripts/minimize_vmray_results.py
+++ b/scripts/minimize_vmray_results.py
@@ -44,6 +44,7 @@ def main(argv=None):
     sv2_json = vmra.zipfile.read("logs/summary_v2.json", pwd=DEFAULT_ARCHIVE_PASSWORD)
     flog_xml = vmra.zipfile.read("logs/flog.xml", pwd=DEFAULT_ARCHIVE_PASSWORD)
     sample_file_buf = vmra.sample_file_buf
+    assert vmra.sample_file_analysis is not None
     sample_sha256: str = vmra.sample_file_analysis.hash_values.sha256.lower()
 
     new_zip_name = f"{analysis_archive.parent / analysis_archive.stem}_min.zip"

From e8550f242cfc20c3700886f2d8b9bcee69f5868e Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Mon, 26 Aug 2024 09:52:12 +0000
Subject: [PATCH 104/105] rename using dashes for consistency

---
 scripts/{capa_as_library.py => capa-as-library.py} | 0
 tests/test_scripts.py                              | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename scripts/{capa_as_library.py => capa-as-library.py} (100%)

diff --git a/scripts/capa_as_library.py b/scripts/capa-as-library.py
similarity index 100%
rename from scripts/capa_as_library.py
rename to scripts/capa-as-library.py
diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 35bf5347f..097f11e3a 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -65,7 +65,7 @@ def get_rule_path():
         pytest.param("show-features.py", ["-F", "0x407970", get_binary_file_path()]),
         pytest.param("show-features.py", ["-P", "MicrosoftEdgeUpdate.exe", get_report_file_path()]),
         pytest.param("show-unused-features.py", [get_binary_file_path()]),
-        pytest.param("capa_as_library.py", [get_binary_file_path()]),
+        pytest.param("capa-as-library.py", [get_binary_file_path()]),
     ],
 )
 def test_scripts(script, args):

From 9eab7eb143800c1031028ec0e31a59aba9ecd4ca Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Mon, 26 Aug 2024 10:11:51 +0000
Subject: [PATCH 105/105] update names

---
 tests/fixtures.py     | 2 +-
 tests/test_scripts.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/fixtures.py b/tests/fixtures.py
index ddd6e87ed..41a656dd9 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -416,7 +416,7 @@ def get_data_path_by_name(name) -> Path:
             / "data"
             / "dynamic"
             / "vmray"
-            / "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795_archive.zip"
+            / "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795_min_archive.zip"
         )
     elif name.startswith("ea2876"):
         return CD / "data" / "ea2876e9175410b6f6719f80ee44b9553960758c7d0f7bed73c0fe9a78d8e669.dll_"
diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 097f11e3a..06a6e9fef 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -27,7 +27,7 @@ def get_binary_file_path():
     return str(CD / "data" / "9324d1a8ae37a36ae560c37448c9705a.exe_")
 
 
-def get_report_file_path():
+def get_cape_report_file_path():
     return str(
         CD
         / "data"
@@ -63,9 +63,10 @@ def get_rule_path():
         pytest.param("show-capabilities-by-function.py", [get_binary_file_path()]),
         pytest.param("show-features.py", [get_binary_file_path()]),
         pytest.param("show-features.py", ["-F", "0x407970", get_binary_file_path()]),
-        pytest.param("show-features.py", ["-P", "MicrosoftEdgeUpdate.exe", get_report_file_path()]),
+        pytest.param("show-features.py", ["-P", "MicrosoftEdgeUpdate.exe", get_cape_report_file_path()]),
         pytest.param("show-unused-features.py", [get_binary_file_path()]),
         pytest.param("capa-as-library.py", [get_binary_file_path()]),
+        # not testing "minimize_vmray_results.py" as we don't currently upload full VMRay analysis archives
     ],
 )
 def test_scripts(script, args):