mandiant · williballenthin · Jan 28, 2021 · Jan 26, 2021 · Jan 27, 2021 · Jan 27, 2021
diff --git a/capa/rules.py b/capa/rules.py
@@ -6,6 +6,7 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 
+import re
 import uuid
 import codecs
 import logging
@@ -600,6 +601,9 @@ def _get_ruamel_yaml_parser():
         # use block mode, not inline json-like mode
         y.default_flow_style = False
 
+        # leave quotes unchanged
+        y.preserve_quotes = True
+
         # indent lists by two spaces below their parent
         #
         #     features:
@@ -614,16 +618,20 @@ def _get_ruamel_yaml_parser():
         return y
 
     @classmethod
-    def from_yaml(cls, s):
-        # use pyyaml because it can be much faster than ruamel (pure python)
-        doc = yaml.load(s, Loader=cls._get_yaml_loader())
+    def from_yaml(cls, s, use_ruamel=False):
+        # pyyaml stays the default because it can be much faster than ruamel (pure python);
+        # ruamel is opt-in: it enables nice formatting and doc roundtripping with comments
+        if not use_ruamel:
+            doc = yaml.load(s, Loader=cls._get_yaml_loader())
+        else:
+            doc = cls._get_ruamel_yaml_parser().load(s)
         return cls.from_dict(doc, s)
 
     @classmethod
-    def from_yaml_file(cls, path):
+    def from_yaml_file(cls, path, use_ruamel=False):  # use_ruamel is passed through to from_yaml
         with open(path, "rb") as f:
             try:
-                return cls.from_yaml(f.read().decode("utf-8"))
+                return cls.from_yaml(f.read().decode("utf-8"), use_ruamel=use_ruamel)
             except InvalidRule as e:
                 raise InvalidRuleWithPath(path, str(e))
 
@@ -716,7 +724,18 @@ def move_to_end(m, k):
         # tweaking `ruamel.indent()` doesn't quite give us the control we want.
         # so, add the two extra spaces that we've determined we need through experimentation.
         # see #263
-        doc = doc.replace("  description:", "    description:")
+        # only do this for the features section, so the meta description doesn't get reformatted
+        # assumes features section always exists
+        features_offset = doc.find("features")
+        doc = doc[:features_offset] + doc[features_offset:].replace("  description:", "    description:")
+
+        # for negative hex numbers, yaml dump outputs:
+        # - offset: !!int '0x-30'
+        # we prefer:
+        # - offset: -0x30
+        # the regex below makes this adjustment; it's ugly, but it spares us from digging into ruamel.yaml internals
+        doc = re.sub(r"!!int '0x-([0-9a-fA-F]+)'", r"-0x\1", doc)
+
         return doc
 
 

diff --git a/scripts/capafmt.py b/scripts/capafmt.py
@@ -38,6 +38,12 @@ def main(argv=None):
     )
     parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
     parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors")
+    parser.add_argument(
+        "-c",
+        "--check",
+        action="store_true",
+        help="Don't output (reformatted) rule, only return status. 0 = no changes, 1 = would reformat",
+    )
     args = parser.parse_args(args=argv)
 
     if args.verbose:
@@ -50,12 +56,22 @@ def main(argv=None):
     logging.basicConfig(level=level)
     logging.getLogger("capafmt").setLevel(level)
 
-    rule = capa.rules.Rule.from_yaml_file(args.path)
+    rule = capa.rules.Rule.from_yaml_file(args.path, use_ruamel=True)
+    reformatted_rule = rule.to_yaml()
+
+    if args.check:
+        if rule.definition == reformatted_rule:
+            logger.info("rule is formatted correctly, nice! (%s)", rule.name)
+            return 0
+        else:
+            logger.info("rule requires reformatting (%s)", rule.name)
+            return 1
+
     if args.in_place:
         with open(args.path, "wb") as f:
-            f.write(rule.to_yaml().encode("utf-8"))
+            f.write(reformatted_rule.encode("utf-8"))
     else:
-        print(rule.to_yaml().rstrip("\n"))
+        print(reformatted_rule)
 
     return 0
 

diff --git a/scripts/lint.py b/scripts/lint.py
@@ -17,6 +17,7 @@
 import sys
 import time
 import string
+import difflib
 import hashlib
 import logging
 import os.path
@@ -25,6 +26,7 @@
 import posixpath
 
 import capa.main
+import capa.rules
 import capa.engine
 import capa.features
 import capa.features.insn
@@ -277,6 +279,32 @@ def check_features(self, ctx, features):
         return False
 
 
+class FormatSingleEmptyLineEOF(Lint):
+    name = "EOF format"
+    recommendation = "end file with a single empty line"
+
+    def check_rule(self, ctx, rule):
+        # flag the rule unless its text ends with exactly one newline
+        text = rule.definition
+        return not text.endswith("\n") or text.endswith("\n\n")
+
+
+class FormatIncorrect(Lint):
+    name = "rule format incorrect"
+    recommendation_template = "use scripts/capafmt.py or adjust as follows\n{:s}"
+
+    def check_rule(self, ctx, rule):
+        # compare the rule text as written against its ruamel round-trip
+        current = rule.definition
+        wanted = capa.rules.Rule.from_yaml(current, use_ruamel=True).to_yaml()
+        if current == wanted:
+            return False
+        # keep line endings so the joined ndiff output stays one entry per line
+        delta = difflib.ndiff(current.splitlines(True), wanted.splitlines(True))
+        self.recommendation = self.recommendation_template.format("".join(delta))
+        return True
+
+
 def run_lints(lints, ctx, rule):
     for lint in lints:
         if lint.check_rule(ctx, rule):
@@ -332,15 +360,25 @@ def lint_meta(ctx, rule):
 )
 
 
-def get_normpath(path):
-    return posixpath.normpath(path).replace(os.sep, "/")
-
-
 def lint_features(ctx, rule):
     features = get_features(ctx, rule)
     return run_feature_lints(FEATURE_LINTS, ctx, features)
 
 
+FORMAT_LINTS = (
+    FormatSingleEmptyLineEOF(),  # rule text must end with exactly one newline
+    FormatIncorrect(),  # rule text must equal its ruamel round-trip output
+)
+
+
+def lint_format(ctx, rule):  # run every lint in FORMAT_LINTS against `rule`
+    return run_lints(FORMAT_LINTS, ctx, rule)
+
+
+def get_normpath(path):  # normalized form of `path` that always uses "/" as the separator
+    return posixpath.normpath(path.replace(os.sep, "/"))  # convert separators first so posixpath can see them
+
+
 def get_features(ctx, rule):
     # get features from rule and all dependencies including subscopes and matched rules
     features = []
@@ -391,6 +429,7 @@ def lint_rule(ctx, rule):
             lint_meta(ctx, rule),
             lint_logic(ctx, rule),
             lint_features(ctx, rule),
+            lint_format(ctx, rule),
         )
     )
 
@@ -518,6 +557,7 @@ def main(argv=None):
 
     capa.main.set_vivisect_log_level(logging.CRITICAL)
     logging.getLogger("capa").setLevel(logging.CRITICAL)
+    logging.getLogger("viv_utils").setLevel(logging.CRITICAL)
 
     time0 = time.time()
 
@@ -549,8 +589,8 @@ def main(argv=None):
 
     did_violate = lint(ctx, rules)
 
-    diff = time.time() - time0
-    logger.debug("lint ran for ~ %02d:%02d", (diff // 60), diff)
+    min, sec = divmod(time.time() - time0, 60)
+    logger.debug("lints ran for ~ %02d:%02dm", min, sec)
 
     if not did_violate:
         logger.info("no suggestions, nice!")