1.8.0: add regexploit

Signed-off-by: rjdbcm <[email protected]>
OZI-Project · Dec 10, 2024 · 5dea24f · 5dea24f
1 parent 6c83218
commit 5dea24f
Show file tree

Hide file tree

Showing 17 changed files with 1,319 additions and 5 deletions.
diff --git a/README.rst b/README.rst
@@ -4,21 +4,38 @@ OZI.build
 
 This is the `OZI-Project <https://github.com/OZI-Project>`_ maintained fork of the mesonpep517 0.2 tag.
 
-This is a simple module that implements pep517 for the meson build system.
+This is a module that implements PEP-517 for the meson build system.
 
 This means that you only need to provide a ``pyproject.toml`` in your project
 source root to be able to publish your project built with meson on PyPI
 and to create a wheel for the project.
 
+Other features include:
+
+* compiling modules to bytecode with pyc_wheel
+* scanning ``pyproject.toml`` for exploitable ReDoS patterns with regexploit
+
 For more information, see `the documentation <https://docs.oziproject.dev/en/stable/ozi_build.html>`_.
 
-OZI.build is licensed under Apache-2.0 and includes ``pyc_wheel`` and
-portions of ``wheel`` whose copyright information is reproduced here.
+License
+-------
+
+OZI.build is licensed under Apache-2.0 and includes ``regexploit``,
+``pyc_wheel`` and portions of ``wheel`` whose copyright information is
+reproduced here.
+
+Apache-2.0 contributors
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``regexploit`` Copyright (c) 2021 Ben Caller <[email protected]>
 
 ``pyc_wheel`` Copyright (c) 2016 Grant Patten <[email protected]>
 
 ``pyc_wheel`` Copyright (c) 2019-2021 Adam Karpierz <[email protected]>
 
+MIT contributors
+^^^^^^^^^^^^^^^^
+
 ``wheel`` Copyright (c) 2012-2014 Daniel Holth <[email protected]> and contributors.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy

diff --git a/meson.build b/meson.build
@@ -1,4 +1,4 @@
-project('OZI.build', version : '1.7.2', license : 'apache-2.0')
+project('OZI.build', version : '1.8.0', license : 'apache-2.0')
 fs = import('fs')
 python = import('python').find_installation()
 subdir('ozi_build')

diff --git a/ozi_build/_at.py b/ozi_build/_at.py
@@ -0,0 +1,58 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
+from ._char import Character
+from ._repeat import InfiniteRepeat, Repeat
+
+
+@dataclass
+class EndOfString:
+    character: Optional[Character] = None
+
+    @property
+    def starriness(self):
+        return 0
+
+    @property
+    def minimum_length(self):
+        return 1  # Meaningless really here
+
+    def overall_character_class(self):
+        return self.character
+
+    def __repr__(self) -> str:
+        return f"${self.character}"
+
+    def __and__(self, other: Character) -> Optional[Character]:
+        return other & self.character
+
+    def example(self):
+        return "\n"  # ish
+
+    def set_character(self, previous_elems: List):
+        """
+        To force backtracking, the dollar will have to not match any previous groups until a mandatory group.
+        This can perhaps be made more lenient.
+
+        To cause backtracking on a long string of a's:
+        a*a*a*$ -> Any [^a]
+        [ab]+a*a*a*$ -> Any [^ab] (baaaaaaaaaaaab does not backtrack)
+        b+a*a*a*$ -> Any [^a]
+        .a*a*a*$ -> Any [^a]
+        .+a*a*a*$ -> Cannot backtrack because everything gets matched by .+ :(
+        """
+        self.character = None
+        for elem in reversed(previous_elems):
+            if elem.minimum_length > 0 and not isinstance(elem, InfiniteRepeat):
+                return  # xa*[ab]*a*$ -> [ab]
+            c = (
+                elem.maximal_character_class()
+                if isinstance(elem, Repeat)
+                else elem.overall_character_class()
+            )
+            if c:
+                if elem.minimum_length > 0 and (self.character & c) != self.character:
+                    # c is smaller than self.character (i.e. c is not an ANY)
+                    # x+a*[ab]*a*$ -> [ab]
+                    return
+                self.character |= c
diff --git a/ozi_build/_branch.py b/ozi_build/_branch.py
@@ -0,0 +1,81 @@
+from dataclasses import dataclass
+from typing import Iterator, List, Optional
+
+from ._at import EndOfString
+from ._char import Character
+from ._repeat import FiniteRepeat, InfiniteRepeat
+from ._sequence import Sequence
+
+
+@dataclass(frozen=True)
+class Branch:
+    branches: List
+    optional: bool = False
+
+    def get_branches(self) -> Iterator:
+        for b in self.branches:
+            yield b
+        if self.optional:
+            yield None
+
+    @property
+    def starriness(self) -> int:
+        return max(b.starriness for b in self.branches)
+
+    @property
+    def minimum_length(self) -> int:
+        return 0 if self.optional else min(b.minimum_length for b in self.branches)
+
+    def overall_character_class(self) -> Optional[Character]:
+        c = Character.ANY()
+        for b in self.branches:
+            c &= b.overall_character_class()
+            if c is None:
+                return None
+        return c
+
+    def maximal_character_class(self):
+        return None  # Really?
+
+    def example(self) -> str:
+        if self.optional:
+            return ""
+        return self.branches[0].example()
+
+    def __len__(self) -> int:
+        return len(self.branches) + int(self.optional)
+
+    def __repr__(self) -> str:
+        middle = " | ".join(str(b) for b in self.branches)
+        return f"BR( {middle} ){'?' if self.optional else ''}"
+
+    def matching_repeats(self):
+        for b in self.branches:
+            if b.starriness > 0:
+                if isinstance(b, InfiniteRepeat):
+                    yield b
+                elif isinstance(b, Sequence):
+                    yield from b.matching_repeats()
+
+
+def make_branch(branches: List):
+    if len(branches) == 1:
+        return branches[0]
+    optional = False
+    non_empty_branches = [b for b in branches if b and not isinstance(b, EndOfString)]
+    if not non_empty_branches:
+        return None
+    if len(non_empty_branches) < len(branches):
+        # (ab|cd|) -> (ab|cd)?
+        optional = True
+    if all(isinstance(b, Character) for b in non_empty_branches):
+        # (a|b) -> [ab], (a|b|) -> [ab]?
+        c = None
+        for b in non_empty_branches:
+            c |= b
+        if optional:
+            return FiniteRepeat(c, 0, 1)
+        else:
+            return c
+
+    return Branch(non_empty_branches, optional)
diff --git a/ozi_build/_categories.py b/ozi_build/_categories.py
@@ -0,0 +1,93 @@
+import sys
+import unicodedata
+from enum import Enum, auto
+from typing import Set
+
+
+class Category(Enum):
+    DIGIT = auto()
+    NOT_DIGIT = auto()
+    WORD = auto()
+    NOT_WORD = auto()
+    SPACE = auto()
+    NOT_SPACE = auto()
+
+    @property
+    def is_positive(self) -> bool:
+        return not self.name.startswith("NOT_")
+
+    def negate(self) -> "Category":
+        if self.is_positive:
+            return Category[f"NOT_{self.name}"]
+        else:
+            return Category[self.name[4:]]
+
+    def example(self) -> str:
+        return EXAMPLE_FOR_CAT[self]
+
+    def contains(self, literal: int) -> bool:
+        c = chr(literal)
+        unicat = unicodedata.category(c)
+        if self is Category.DIGIT:
+            return unicat == "Nd"
+        if self is Category.NOT_DIGIT:
+            return unicat != "Nd"
+        if self is Category.WORD:
+            return (
+                unicat[0] == "L" or unicat == "Nd" or literal == 0x5F
+            )  # underscore is a word character
+        if self is Category.NOT_WORD:
+            return unicat[0] != "L" and unicat != "Nd" and literal != 0x5F
+        if self is Category.SPACE:
+            return unicat == "Zs" or c in (" ", "\n", "\t", "\r", "\f", "\v")
+        if self is Category.NOT_SPACE:
+            return unicat != "Zs" and c not in (" ", "\n", "\t", "\r", "\f", "\v")
+
+
+CATS = {}
+
+
+def list_category(category, full_unicode: bool = False):
+    if (cached := CATS.get(category)) :
+        yield from cached
+    for data in range((sys.maxunicode + 1) if full_unicode else 256):
+        c = chr(data)
+        unicat = unicodedata.category(c)
+        if category is Category.DIGIT:
+            if unicat == "Nd":
+                yield data
+        elif category is Category.NOT_DIGIT:
+            if unicat != "Nd":
+                yield data
+        elif category is Category.WORD:
+            if unicat[0] == "L" or unicat == "Nd" or data == 0x5F:
+                yield data
+        elif category is Category.NOT_WORD:
+            if unicat[0] != "L" and unicat != "Nd" and data != 0x5F:
+                yield data
+        elif category is Category.SPACE:
+            if unicat == "Zs" or c in (" ", "\n", "\t", "\r", "\f", "\v"):
+                yield data
+        elif category is Category.NOT_SPACE:
+            if unicat != "Zs" and c not in (" ", "\n", "\t", "\r", "\f", "\v"):
+                yield data
+
+
+def covers_any(categories: Set[Category]) -> bool:
+    for c in categories:
+        if c.is_positive and c.negate() in categories:
+            return True
+    return False
+
+
+# NOTE(review): disabled priming of the CATS cache, kept for reference. The
+# keys are sre_parse constants rather than Category members - confirm before
+# re-enabling.
+# CATS[sre_parse.CATEGORY_DIGIT] = list(list_category(sre_parse.CATEGORY_DIGIT))
+# CATS[sre_parse.CATEGORY_SPACE] = list(list_category(sre_parse.CATEGORY_SPACE))
+# CATS[sre_parse.CATEGORY_WORD] = list(list_category(sre_parse.CATEGORY_WORD))
+
+# One sample character per category; looked up by Category.example().
+EXAMPLE_FOR_CAT = {
+    Category.DIGIT: "4",
+    Category.NOT_DIGIT: "!",
+    Category.WORD: "w",
+    Category.NOT_WORD: "$",
+    Category.SPACE: " ",
+    Category.NOT_SPACE: ".",
+}