From 5dea24f212615e69a4f9c45f15b8b0d532ea39b9 Mon Sep 17 00:00:00 2001 From: rjdbcm Date: Mon, 9 Dec 2024 19:50:31 -0600 Subject: [PATCH] 1.8.0: add regexploit Signed-off-by: rjdbcm --- README.rst | 23 +++- meson.build | 2 +- ozi_build/_at.py | 58 ++++++++++ ozi_build/_branch.py | 81 ++++++++++++++ ozi_build/_categories.py | 93 +++++++++++++++ ozi_build/_char.py | 236 +++++++++++++++++++++++++++++++++++++++ ozi_build/_files.py | 35 ++++++ ozi_build/_groupref.py | 28 +++++ ozi_build/_ranges.py | 33 ++++++ ozi_build/_redos.py | 225 +++++++++++++++++++++++++++++++++++++ ozi_build/_repeat.py | 71 ++++++++++++ ozi_build/_sequence.py | 115 +++++++++++++++++++ ozi_build/_sre.py | 205 ++++++++++++++++++++++++++++++++++ ozi_build/_text.py | 51 +++++++++ ozi_build/_util.py | 51 +++++++++ ozi_build/config.py | 2 + ozi_build/meson.build | 15 ++- 17 files changed, 1319 insertions(+), 5 deletions(-) create mode 100644 ozi_build/_at.py create mode 100644 ozi_build/_branch.py create mode 100644 ozi_build/_categories.py create mode 100644 ozi_build/_char.py create mode 100644 ozi_build/_files.py create mode 100644 ozi_build/_groupref.py create mode 100644 ozi_build/_ranges.py create mode 100644 ozi_build/_redos.py create mode 100644 ozi_build/_repeat.py create mode 100644 ozi_build/_sequence.py create mode 100644 ozi_build/_sre.py create mode 100644 ozi_build/_text.py diff --git a/README.rst b/README.rst index dc03237..f4ba7c1 100644 --- a/README.rst +++ b/README.rst @@ -4,21 +4,38 @@ OZI.build This is the `OZI-Project `_ maintained fork of the mesonpep517 0.2 tag. -This is a simple module that implements pep517 for the meson build system. +This is a module that implements PEP-517 for the meson build system. This means that you only need to provide a ``pyproject.toml`` in your project source root to be able to publish your project built with meson on PyPI and to create a wheel for the project. 
+Other features include: + +* compiling modules to bytecode with pyc_wheel +* scanning ``pyproject.toml`` for exploitable ReDoS patterns with regexploit + For more information have a look at `the documentation `_ -OZI.build is licensed under Apache-2.0 and includes ``pyc_wheel`` and -portions of ``wheel`` whose copyright information is reproduced here. +License +------- + +OZI.build is licensed under Apache-2.0 and includes ``regexploit``, +``pyc_wheel`` and portions of ``wheel`` whose copyright information is +reproduced here. + +Apache-2.0 contributors +^^^^^^^^^^^^^^^^^^^^^^^ + +``regexploit`` Copyright (c) 2021 Ben Caller ``pyc_wheel`` Copyright (c) 2016 Grant Patten ``pyc_wheel`` Copyright (c) 2019-2021 Adam Karpierz +MIT contributors +^^^^^^^^^^^^^^^^ + ``wheel`` Copyright (c) 2012-2014 Daniel Holth and contributors. Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/meson.build b/meson.build index 1cc8b2f..92d7c9e 100644 --- a/meson.build +++ b/meson.build @@ -1,4 +1,4 @@ -project('OZI.build', version : '1.7.2', license : 'apache-2.0') +project('OZI.build', version : '1.8.0', license : 'apache-2.0') fs = import('fs') python = import('python').find_installation() subdir('ozi_build') diff --git a/ozi_build/_at.py b/ozi_build/_at.py new file mode 100644 index 0000000..aa6326b --- /dev/null +++ b/ozi_build/_at.py @@ -0,0 +1,58 @@ +from dataclasses import dataclass +from typing import List, Optional + +from ._char import Character +from ._repeat import InfiniteRepeat, Repeat + + +@dataclass +class EndOfString: + character: Optional[Character] = None + + @property + def starriness(self): + return 0 + + @property + def minimum_length(self): + return 1 # Meaningless really here + + def overall_character_class(self): + return self.character + + def __repr__(self) -> str: + return f"${self.character}" + + def __and__(self, other: Character) -> Optional[Character]: + return other & self.character + + def example(self): + return 
"\n" # ish + + def set_character(self, previous_elems: List): + """ + To force backtracking, the dollar will have to not match any previous groups until a mandatory group. + This can perhaps be made more lenient. + + To cause backtracking on a long string of a's: + a*a*a*$ -> Any [^a] + [ab]+a*a*a*$ -> Any [^ab] (baaaaaaaaaaaab does not backtrack) + b+a*a*a*$ -> Any [^a] + .a*a*a*$ -> Any [^a] + .+a*a*a*$ -> Cannot backtrack because everything gets matched by .+ :( + """ + self.character = None + for elem in reversed(previous_elems): + if elem.minimum_length > 0 and not isinstance(elem, InfiniteRepeat): + return # xa*[ab]*a*$ -> [ab] + c = ( + elem.maximal_character_class() + if isinstance(elem, Repeat) + else elem.overall_character_class() + ) + if c: + if elem.minimum_length > 0 and (self.character & c) != self.character: + # c is smaller than self.character (i.e. c is not an ANY) + # x+a*[ab]*a*$ -> [ab] + return + self.character |= c diff --git a/ozi_build/_branch.py b/ozi_build/_branch.py new file mode 100644 index 0000000..050a182 --- /dev/null +++ b/ozi_build/_branch.py @@ -0,0 +1,81 @@ +from dataclasses import dataclass +from typing import Iterator, List, Optional + +from ._at import EndOfString +from ._char import Character +from ._repeat import FiniteRepeat, InfiniteRepeat +from ._sequence import Sequence + + +@dataclass(frozen=True) +class Branch: + branches: List + optional: bool = False + + def get_branches(self) -> Iterator: + for b in self.branches: + yield b + if self.optional: + yield None + + @property + def starriness(self) -> int: + return max(b.starriness for b in self.branches) + + @property + def minimum_length(self) -> int: + return 0 if self.optional else min(b.minimum_length for b in self.branches) + + def overall_character_class(self) -> Optional[Character]: + c = Character.ANY() + for b in self.branches: + c &= b.overall_character_class() + if c is None: + return None + return c + + def maximal_character_class(self): + return None # 
Really? + + def example(self) -> str: + if self.optional: + return "" + return self.branches[0].example() + + def __len__(self) -> int: + return len(self.branches) + int(self.optional) + + def __repr__(self) -> str: + middle = " | ".join(str(b) for b in self.branches) + return f"BR( {middle} ){'?' if self.optional else ''}" + + def matching_repeats(self): + for b in self.branches: + if b.starriness > 0: + if isinstance(b, InfiniteRepeat): + yield b + elif isinstance(b, Sequence): + yield from b.matching_repeats() + + +def make_branch(branches: List): + if len(branches) == 1: + return branches[0] + optional = False + non_empty_branches = [b for b in branches if b and not isinstance(b, EndOfString)] + if not non_empty_branches: + return None + if len(non_empty_branches) < len(branches): + # (ab|cd|) -> (ab|cd)? + optional = True + if all(isinstance(b, Character) for b in non_empty_branches): + # (a|b) -> [ab], (a|b|) -> [ab]? + c = None + for b in non_empty_branches: + c |= b + if optional: + return FiniteRepeat(c, 0, 1) + else: + return c + + return Branch(non_empty_branches, optional) diff --git a/ozi_build/_categories.py b/ozi_build/_categories.py new file mode 100644 index 0000000..4cc10ae --- /dev/null +++ b/ozi_build/_categories.py @@ -0,0 +1,93 @@ +import sys +import unicodedata +from enum import Enum, auto +from typing import Set + + +class Category(Enum): + DIGIT = auto() + NOT_DIGIT = auto() + WORD = auto() + NOT_WORD = auto() + SPACE = auto() + NOT_SPACE = auto() + + @property + def is_positive(self) -> bool: + return not self.name.startswith("NOT_") + + def negate(self) -> "Category": + if self.is_positive: + return Category[f"NOT_{self.name}"] + else: + return Category[self.name[4:]] + + def example(self) -> str: + return EXAMPLE_FOR_CAT[self] + + def contains(self, literal: int) -> bool: + c = chr(literal) + unicat = unicodedata.category(c) + if self is Category.DIGIT: + return unicat == "Nd" + if self is Category.NOT_DIGIT: + return unicat != "Nd" + if 
self is Category.WORD: + return ( + unicat[0] == "L" or unicat == "Nd" or literal == 0x5F + ) # underscore is a word character + if self is Category.NOT_WORD: + return unicat[0] != "L" and unicat != "Nd" and literal != 0x5F + if self is Category.SPACE: + return unicat == "Zs" or c in (" ", "\n", "\t", "\r", "\f", "\v") + if self is Category.NOT_SPACE: + return unicat != "Zs" and c not in (" ", "\n", "\t", "\r", "\f", "\v") + + +CATS = {} + + +def list_category(category, full_unicode: bool = False): + if (cached := CATS.get(category)) : + yield from cached + for data in range((sys.maxunicode + 1) if full_unicode else 256): + c = chr(data) + unicat = unicodedata.category(c) + if category is Category.DIGIT: + if unicat == "Nd": + yield data + elif category is Category.NOT_DIGIT: + if unicat != "Nd": + yield data + elif category is Category.WORD: + if unicat[0] == "L" or unicat == "Nd" or data == 0x5F: + yield data + elif category is Category.NOT_WORD: + if unicat[0] != "L" and unicat != "Nd" and data != 0x5F: + yield data + elif category is Category.SPACE: + if unicat == "Zs" or c in (" ", "\n", "\t", "\r", "\f", "\v"): + yield data + elif category is Category.NOT_SPACE: + if unicat != "Zs" and c not in (" ", "\n", "\t", "\r", "\f", "\v"): + yield data + + +def covers_any(categories: Set[Category]) -> bool: + for c in categories: + if c.is_positive and c.negate() in categories: + return True + return False + + +# CATS[sre_parse.CATEGORY_DIGIT] = list(list_category(sre_parse.CATEGORY_DIGIT)) +# CATS[sre_parse.CATEGORY_SPACE] = list(list_category(sre_parse.CATEGORY_SPACE)) +# CATS[sre_parse.CATEGORY_WORD] = list(list_category(sre_parse.CATEGORY_WORD)) +EXAMPLE_FOR_CAT = { + Category.DIGIT: "4", + Category.NOT_DIGIT: "!", + Category.WORD: "w", + Category.NOT_WORD: "$", + Category.SPACE: " ", + Category.NOT_SPACE: ".", +} diff --git a/ozi_build/_char.py b/ozi_build/_char.py new file mode 100644 index 0000000..865665a --- /dev/null +++ b/ozi_build/_char.py @@ -0,0 +1,236 
@@ +import string +from dataclasses import dataclass +from typing import Optional, Set + +from ._categories import Category, covers_any, list_category +from ._ranges import Range, lits_to_ranges + + +@dataclass(frozen=True) +class Character: + literals: Optional[Set[int]] = None + categories: Optional[Set[Category]] = None + positive: bool = True + + @staticmethod + def ANY() -> "Character": + return Character() + + @staticmethod + def LITERAL(literal: int) -> "Character": + return Character({literal}) + + @property + def minimum_length(self) -> int: + return 1 + + @property + def starriness(self) -> int: + return 0 + + def __hash__(self) -> int: + return hash( + ( + self.positive, + tuple(sorted(self.literals)) if self.literals else None, + tuple(sorted(self.categories)) if self.categories else None, + ) + ) + + def exact_character_class(self) -> "Character": + return self + + def overall_character_class(self) -> "Character": + return self + + def maximal_character_class(self) -> "Character": + return self + + @property + def is_any(self) -> bool: + return self.literals is None and self.categories is None and self.positive + + @property + def _is_positive_literal(self) -> bool: + return self.positive and self.literals is not None and self.categories is None + + @property + def _is_negative_literal(self) -> bool: + return ( + not self.positive and self.literals is not None and self.categories is None + ) + + @property + def _is_positive_category(self) -> bool: + return self.positive and self.literals is None and self.categories is not None + + @property + def _is_negative_category(self) -> bool: + return ( + not self.positive and self.literals is None and self.categories is not None + ) + + def expand_categories(self) -> "Character": + """ + This is the nuclear option where we expand the categories into literals. + Can be huge in unicode. 
+ """ + if self.categories: + lits: Set[int] = set(self.literals) if self.literals else set() + for c in self.categories: + lits.update(list_category(c)) + return Character(literals=lits, positive=self.positive) + + return self + + def __and__(self, other: "Optional[Character]") -> "Optional[Character]": + if other is None: + return None + if self.is_any: + return other + if other.is_any: + return self + + # [ab] & [bc] -> [c] + if self._is_positive_literal and other._is_positive_literal: + lits = self.literals & other.literals + if not lits: + return None + return Character(literals=lits) + if self._is_positive_category and other._is_positive_category: + cats = self.categories & other.categories + if not cats: + return None + return Character(categories=cats) + # [^ab] & [^bc] -> [^abc] + if self._is_negative_literal and other._is_negative_literal: + return Character(literals=self.literals | other.literals, positive=False) + if self._is_negative_category and other._is_negative_category: + categories = self.categories | other.categories + if covers_any(categories): # [^\d] & [^\D] = nothing + return None + return Character(categories=categories, positive=False) + # [ab] & [^bc] -> [a] + if self._is_positive_literal and other._is_negative_literal: + lits = self.literals - other.literals + if not lits: + return None + return Character(literals=lits) + if other._is_positive_literal and self._is_negative_literal: + lits = other.literals - self.literals + if not lits: + return None + return Character(literals=lits) + + # TODO: be less lazy and sort out the general case without expanding everything if possible + return self.expand_categories() & other.expand_categories() + + def __rand__(self, other: "Optional[Character]") -> "Optional[Character]": + return self & other + + def __or__(self, other: "Optional[Character]") -> "Optional[Character]": + if other is None: + return self + if self.is_any or other.is_any: + return Character.ANY() + if self == other: + return self 
+ if nor := (self.negate() & other.negate()): # Slow, but logical + return nor.negate() + else: + return Character.ANY() + + def __ror__(self, other: "Optional[Character]") -> "Optional[Character]": + return self | other + + def __repr__(self) -> str: + if self.is_any: + return "." + result = "[" + if not self.positive: + result += "^" + more = False + if self.literals is not None: + lits, ranges = lits_to_ranges(self.literals) + result += ",".join(literal_repr(o) for o in lits) + if lits and ranges: + result += "," + result += ",".join(range_repr(r) for r in ranges) + more = True + if self.categories is not None: + if more: + result += ";" + result += ",".join(c.name for c in self.categories) + more = True + return result + "]" + + def example(self) -> str: + for c in nice_characters(): + if self.matches(c): + return chr(c) + + if self.positive: + if self.literals: + if len(self.literals) > 1: + # Try to avoid \n due to false positives with the . character and flags + return chr(next(o for o in self.literals if o != 0xA)) + return chr(next(iter(self.literals))) + elif self.categories: + return sorted(self.categories, key=lambda c: 0 if c.is_positive else 1)[ + 0 + ].example() + + raise NotImplementedError(self) + + def negate(self) -> "Optional[Character]": + if self.is_any: + return None + return Character( + literals=self.literals, + categories=self.categories, + positive=not self.positive, + ) + + def contains(self, subgroup: "Character") -> bool: + if self.is_any: + return True + if subgroup.is_any: + return False + if subgroup == self: + return True + + if self._is_positive_literal and subgroup._is_positive_literal: + return not (subgroup.literals - self.literals) + if self._is_positive_category and subgroup._is_positive_category: + return not (subgroup.categories - self.categories) + + raise NotImplementedError # Lazy, TODO: do full match + + def matches(self, literal: int) -> bool: + if self.is_any: + return True + if self.literals is not None and literal 
in self.literals: + return self.positive + if self.categories: + for cat in self.categories: + if cat.contains(literal): + return self.positive + return not self.positive + + +def nice_characters(): + for c in string.printable[:-5]: + yield ord(c) + + +def literal_repr(literal: int) -> str: + c = chr(literal) + if c in string.digits or c in string.ascii_letters: + return c + elif c in string.punctuation: + return f"{literal:02x}:{c}" + return f"{literal:02x}" + + +def range_repr(r: Range) -> str: + return "[{}-{}]".format(literal_repr(r.min_val), literal_repr(r.max_val)) diff --git a/ozi_build/_files.py b/ozi_build/_files.py new file mode 100644 index 0000000..d350721 --- /dev/null +++ b/ozi_build/_files.py @@ -0,0 +1,35 @@ +import os +import os.path +from glob import iglob +from typing import List, Optional + + +def _file_generator( + files_argument: List[str], is_glob: bool, filename_globs: List[str] +): + if is_glob: + for fglob in files_argument: + yield from iglob(fglob, recursive=True) + else: + for f in files_argument: + if os.path.isdir(f): + for g in filename_globs: + yield from iglob(os.path.join(f, "**", g), recursive=True) + else: + yield f + + +def file_generator( + files_argument: List[str], + is_glob: bool, + filename_globs: List[str], + ignore: Optional[List[str]] = None, +): + gen = _file_generator(files_argument, is_glob, filename_globs) + if ignore: + for f in gen: + if any(i in f for i in ignore): + continue + yield f + else: + yield from gen diff --git a/ozi_build/_groupref.py b/ozi_build/_groupref.py new file mode 100644 index 0000000..68a4712 --- /dev/null +++ b/ozi_build/_groupref.py @@ -0,0 +1,28 @@ +from ._repeat import FiniteRepeat, InfiniteRepeat +from ._branch import Branch +from ._sequence import Sequence + + +def subpattern_to_groupref(subpattern): + if subpattern is None: + return None + if subpattern.starriness == 0: + return subpattern + if isinstance(subpattern, FiniteRepeat): + return subpattern.alter_repeat( + 
subpattern_to_groupref(subpattern.repeat), + ) + if isinstance(subpattern, InfiniteRepeat): + return FiniteRepeat( + subpattern_to_groupref(subpattern.repeat), + subpattern.minimum_repeats, + subpattern.minimum_repeats + 1, + ) + if isinstance(subpattern, Branch): + return Branch( + [subpattern_to_groupref(b) for b in subpattern.branches], + subpattern.optional, + ) + if isinstance(subpattern, Sequence): + return Sequence([subpattern_to_groupref(e) for e in subpattern.elements]) + return subpattern diff --git a/ozi_build/_ranges.py b/ozi_build/_ranges.py new file mode 100644 index 0000000..c62bd46 --- /dev/null +++ b/ozi_build/_ranges.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass +from typing import Iterator, List, Set, Tuple + + +@dataclass(frozen=True) +class Range: + min_val: int + max_val: int + + +def lits_to_ranges( + literals: Iterator[int], +) -> Tuple[Set[int], Set[Range]]: + lits = set() + ranges = set() + buf: List[int] = [] + for lit in sorted(literals): + if len(buf) and buf[-1] != lit - 1: + # Discontinuity + if len(buf) < 3: + lits.update(buf) + else: + ranges.add(Range(buf[0], buf[-1])) + buf = [lit] + else: + buf.append(lit) + + if len(buf) == 1: + lits.add(buf[0]) + elif len(buf) > 1: + ranges.add(Range(buf[0], buf[-1])) + + return lits, ranges diff --git a/ozi_build/_redos.py b/ozi_build/_redos.py new file mode 100644 index 0000000..2d47320 --- /dev/null +++ b/ozi_build/_redos.py @@ -0,0 +1,225 @@ +import logging +from dataclasses import dataclass +from typing import Iterator, List, Optional + +from ._at import EndOfString +from ._branch import Branch +from ._char import Character +from ._repeat import InfiniteRepeat, Repeat +from ._sequence import Sequence + + +@dataclass(frozen=True) +class Redos: + starriness: int + prefix_sequence: Sequence + redos_sequence: Sequence + repeated_character: Character + killer: Optional[Character] + + @property + def example_prefix(self) -> str: + return self.prefix_sequence.example() + + def 
example(self, js_flavour: bool = False) -> str: + repeated_char = self.repeated_character + killer = self.killer + # Try to find a repeating character which is also a killer + if killer and (killing_repeat := repeated_char & killer): + repeated_char = killing_repeat + killer = None + + prefix = ( + self.example_prefix.encode("unicode_escape").decode().replace("'", "\\'") + ) + repeated_char_s = ( + repeated_char.example() + .encode("unicode_escape") + .decode() + .replace("'", "\\'") + ) + e = f"'{prefix}' + " if prefix else "" + if js_flavour: + e += f"'{repeated_char_s}'.repeat(3456)" + else: + e += f"'{repeated_char_s}' * 3456" + + if killer: + killer_s = ( + killer.example().encode("unicode_escape").decode().replace("'", "\\'") + ) + return e + f" + '{killer_s}'" + return e + + +def find(sequence, flags: int = 0) -> List[Redos]: + """ + Returns Redos objects sorted by severity (most starry first), then sorted by example_prefix (shortest first). + """ + redos = [] + for r in find_redos(sequence): + if r not in redos: + redos.append(r) + return sorted(redos, key=lambda r: -r.starriness * 1000 + len(r.example_prefix)) + + +def expand_branches(seq: Sequence) -> Iterator[Sequence]: + """ + This could blow up exponentially, but it's nicer for now to expand branches. 
+ """ + head = [] + for i, elem in enumerate(seq.elements): + if isinstance(elem, Branch): + for b in elem.get_branches(): + head_plus_branch = head + ( + [] if not b else [b] if not isinstance(b, Sequence) else b.elements + ) + for tail in expand_branches(Sequence(seq.elements[i + 1 :])): + yield Sequence(head_plus_branch + tail.elements) + return # All processing in yields + elif isinstance(elem, Repeat) and elem.starriness > 10: + logging.debug("Exponential: %s", elem) + if isinstance(elem.repeat, (Sequence, Branch)): + for tail in expand_branches(Sequence(seq.elements[i + 1 :])): + yield Sequence(head + [elem] + tail.elements) + for pseudo_repeat in elem.repeat.matching_repeats(): + logging.debug("Pseudo repeat %s", pseudo_repeat) + yield Sequence( + head + [elem.alter_repeat(pseudo_repeat)] + tail.elements + ) + else: + head.append(elem) + else: + head.append(elem) + yield Sequence(head) + + +def find_redos(sequence_with_branches) -> Iterator[Redos]: + logging.debug(sequence_with_branches) + if not isinstance( + sequence_with_branches, Sequence + ): # singleton like Branch (ab|cd) + sequence_with_branches = Sequence([sequence_with_branches]) + for seq in expand_branches(sequence_with_branches): + yield from find_redos_in_branchless_sequence(seq) + + +def find_redos_in_branchless_sequence(seq: Sequence) -> Iterator[Redos]: + logging.debug(seq) + for i, elem in enumerate(seq.elements): + # TODO branches + if isinstance(elem, InfiniteRepeat) and (c := elem.overall_character_class()): + yield from make_redos(seq, i, i + 1, c, elem.starriness) + + +def make_redos( + seq: Sequence, + sequence_start: int, + continue_from: int, + repeated_character: Character, + starriness: int, +) -> Iterator[Redos]: + # TODO branches + character_history = [repeated_character] + logging.debug( + "Make ReDoS %d %d %s %d", + sequence_start, + continue_from, + repeated_character, + starriness, + ) + for current_index in range(continue_from, len(seq)): + elem = 
seq.elements[current_index] + + if isinstance(elem, EndOfString): + # May need to go back before the matching sequence to calculate $ + elem.set_character(seq.elements[:current_index]) + + eoc = elem.overall_character_class() + new_c = repeated_character & eoc + logging.debug("%s & %s = %s (for %s)", repeated_character, eoc, new_c, elem) + + # Handle optional elements + if elem.minimum_length == 0: + if elem.starriness: + # If we have a*, we branch and try with and without it + if new_c != repeated_character: + # Only branch if we have [ab]a* : if we have aa* or a[ab]* then the character class doesn't change + # Try without this element + yield from make_redos( + seq, + sequence_start, + current_index + 1, + repeated_character, + starriness, + ) + else: + continue # Don't care about finite repeats (abc)? or a{,4} + + # print(repeated_character, "+", elem.overall_character_class(), "->", new_c) + if new_c is None: + # This element will force backtracking as it's incompatible with `repeated_character` + if elem.minimum_length and starriness > 2: + yield redos_found( + seq, + sequence_start, + current_index, + repeated_character, + starriness, + None, + ) + return + + starriness += elem.starriness + repeated_character = new_c + character_history.append(new_c) + + # Everything matched! 
We need to work backwards and find a 'killer' to cause backtracking if we want ReDoS + logging.debug("Backtracking: %s", character_history) + for current_index in reversed(range(continue_from, len(seq))): + elem = seq.elements[current_index] + character_history.pop() + starriness -= elem.starriness + if starriness <= 2: + return + # Can't get backtracking by not matching optional groups + if elem.minimum_length > 0: + # Find a character which matches the sequence and then fails on the killer + if (match := elem.overall_character_class()) and (killer := match.negate()): + old_repeat = character_history.pop() + logging.debug( + "%s (for %s): killer=%s, repeat=%s", + match, + elem, + killer, + old_repeat, + ) + yield redos_found( + seq, + sequence_start, + current_index, + old_repeat, + starriness, + killer, + ) + return + logging.debug("Backtracking: FAIL") + + +def redos_found( + seq: Sequence, + start: int, + backtrack_at: int, + repeated_character: Character, + starriness: int, + killer: Optional[Character], +) -> Redos: + # TODO: Try to include some skipped optional parts (like `?`) just to make it nicer + logging.debug("ReDoS found") + return Redos( + starriness, + Sequence(seq.elements[:start]), + Sequence(seq.elements[start : backtrack_at + 1]), + repeated_character, + killer, + ) diff --git a/ozi_build/_repeat.py b/ozi_build/_repeat.py new file mode 100644 index 0000000..9885cec --- /dev/null +++ b/ozi_build/_repeat.py @@ -0,0 +1,71 @@ +from dataclasses import dataclass +from typing import Any, Optional + +from ._char import Character + + +@dataclass(frozen=True) +class Repeat: + repeat: Any + minimum_repeats: int + + def example(self) -> str: + if self.minimum_repeats == 0: + return "" + return self.repeat.example() * self.minimum_repeats + + @property + def minimum_length(self) -> int: + return self.minimum_repeats * self.repeat.minimum_length + + @property + def starriness(self) -> int: + return self.repeat.starriness # ? 
and {1,30} are not that starry + + def exact_character_class(self) -> Optional[Character]: + """ + Repeated character e.g. [bc] for [bc]*, or [a] for (aaa)* + """ + return self.repeat.exact_character_class() + + def overall_character_class(self) -> Optional[Character]: + """ + (23)+ -> None, (22)* -> 2 + """ + return self.repeat.overall_character_class() + + def maximal_character_class(self) -> Character: + """ + (23)+ -> [23], (22)* -> 2, (23*)* -> [23] + Useful for finding a way to kill a sequence like a(bc*)*$ + """ + return self.repeat.maximal_character_class() + + +@dataclass(frozen=True) +class InfiniteRepeat(Repeat): + forced_starriness: Optional[int] = None + + @property + def starriness(self) -> int: + if self.forced_starriness is not None: + return self.forced_starriness + # a*a*a* is cubic whereas (a*)* is exponential but here we just call it 10 + return 1 + self.repeat.starriness * 10 + + def __repr__(self) -> str: + return f"{self.repeat}{{{self.minimum_repeats}+}}" + + def alter_repeat(self, repeat) -> "InfiniteRepeat": + return InfiniteRepeat(repeat, self.minimum_repeats) + + +@dataclass(frozen=True) +class FiniteRepeat(Repeat): + maximum_repeats: int + + def __repr__(self) -> str: + return f"{self.repeat}{{{self.minimum_repeats},{self.maximum_repeats}}}" + + def alter_repeat(self, repeat) -> "FiniteRepeat": + return FiniteRepeat(repeat, self.minimum_repeats, self.maximum_repeats) diff --git a/ozi_build/_sequence.py b/ozi_build/_sequence.py new file mode 100644 index 0000000..76b2d53 --- /dev/null +++ b/ozi_build/_sequence.py @@ -0,0 +1,115 @@ +from dataclasses import dataclass +from typing import List, Optional + +from ._char import Character +from ._repeat import InfiniteRepeat + + +@dataclass(frozen=True) +class Sequence: + elements: List + + @property + def starriness(self): + return sum(e.starriness for e in self.elements) + + def __len__(self): + return len(self.elements) + + def example(self) -> str: + return "".join(e.example() for e in 
self.elements) + + @property + def minimum_length(self) -> int: + accum: int = 0 + for e in self.elements: + accum += e.minimum_length + return accum + + def exact_character_class(self) -> Optional[Character]: + """ + aa*a -> a, abc -> None, [ab][abc] -> None + """ + first = self.elements[0].exact_character_class() + if first is None: + return None + for c in self.elements[1:]: + if c != first: + return None + return c + + def overall_character_class(self) -> Optional[Character]: + """ + aa*a -> a, abc -> None, [ab][abc] -> [ab] + a?b -> b, a+b -> None, [ab]+b* -> b + """ + c = Character.ANY() + for e in self.elements: + c &= e.overall_character_class() + if not c: + return None + return c + + def matching_repeats(self): + """Complicated way to get the possible character classes for a sequence""" + c = Character.ANY() + has_mandatory = False + optionals = [] + starriness = 0 + minimum_length = 0 + for e in self.elements: + if e.minimum_length: + c &= e.overall_character_class() + if not c: + return None + has_mandatory = True + starriness += e.starriness + minimum_length += e.minimum_length + elif e.starriness > 0: + optionals.append(e) + possibilities = {c: starriness} if has_mandatory else {} + for e in optionals: + if new_c := e.overall_character_class() & c: + if new_c in possibilities: + possibilities[new_c] += e.starriness + else: + possibilities[new_c] = e.starriness + + if len(possibilities) > 1: + # (a*[ab]*a*[bc]*[bcd]*.+a*)*@ has classes {.: 1, [a]: 5, [[a-b]]: 2, [[b-c]]: 3, [[b-d]]: 2, [b]: 3} + # This could blow up! 
+ poss_chars = list(possibilities.items()) + merged_chars = {} + while poss_chars: + c_a, s_a = poss_chars.pop() + for c_b, s_b in poss_chars: + if (merged := c_a & c_b) is not None: + if merged == c_a: + possibilities[c_a] += s_b + elif merged == c_b: + possibilities[c_b] += s_a + else: + if merged not in merged_chars: + merged_chars[merged] = set() + merged_chars[merged] |= {(c_a, s_a), (c_b, s_b)} + for merged, set_of_chars in merged_chars.items(): + possibilities[merged] = sum(s for _, s in set_of_chars) + + for cc, s in possibilities.items(): + if s: + yield InfiniteRepeat(cc, minimum_length, forced_starriness=s) + + def maximal_character_class(self) -> Character: + """ + Only useful when this Sequence is inside a Repeat + a*b -> [ab], ab* -> [ab] + Since forcing backtracking for (bc*)$ + """ + c = None + for e in self.elements: + if (mcc := e.maximal_character_class()) is not None: + c = mcc | c + return c + + def __repr__(self) -> str: + return "SEQ{ " + " ".join(str(e) for e in self.elements) + " }" diff --git a/ozi_build/_sre.py b/ozi_build/_sre.py new file mode 100644 index 0000000..b27bc9f --- /dev/null +++ b/ozi_build/_sre.py @@ -0,0 +1,205 @@ +import sre_constants +import sre_parse +from typing import List, Optional, Set, Tuple, Union # noqa: I100, I201 + +from ._at import EndOfString +from ._branch import Branch, make_branch +from ._categories import Category, covers_any +from ._char import Character +from ._groupref import subpattern_to_groupref +from ._repeat import FiniteRepeat, InfiniteRepeat +from ._sequence import Sequence + +SreConstant = sre_constants._NamedIntConstant +SreOpData = Union[Tuple, List, int, SreConstant, None] +SreOp = Tuple[SreConstant, SreOpData] + + +class SreOpParser: + def __init__(self): + self._groups = {} + self.negative_lookahead: Optional[Character] = None + + def parse_sre(self, pattern: str, flags: int = 0): + return self.sequence_or_singleton(sre_parse.parse(pattern, flags)) + + def parse_op(self, op: SreConstant, 
data: SreOpData): + return getattr(self, f"from_{op.name}")(data) + + def sequence_or_singleton(self, ops: List[SreOp]): + elems = [] + for p in (self.parse_op(*op) for op in ops): + if p is not None: + if isinstance(p, Sequence): + elems.extend(p.elements) + else: + elems.append(p) + if len(elems) == 0: + return None + if len(elems) == 1: + return elems[0] + return Sequence(elems) + + def from_SUBPATTERN(self, data: Tuple[int, int, int, List[SreOp]]): + ref = data[0] + elements = data[3] + result = self.sequence_or_singleton(elements) + self._groups[ref] = result + return result + + def from_MAX_REPEAT( + self, + data: Tuple[ + int, + Union[int, SreConstant], + List[SreOp], + ], + ) -> Union[FiniteRepeat, InfiniteRepeat, Branch, None]: + minimum, maximum, elements = data + infinite = maximum is sre_constants.MAXREPEAT + # TODO support negative lookahead before repeat with minimum = 0 + negative_lookahead = self.use_negative_lookahead() + repeatable = self.sequence_or_singleton(elements) + if repeatable is None: + return None + if ( + minimum == 0 + and maximum == 1 + and repeatable.starriness + and not repeatable.overall_character_class() + ): + # Interesting (starry) optional sequences as branches (ab*)? 
-> (ab*|) + return make_branch([repeatable, None]) + if infinite: + if ( + negative_lookahead is not None + and minimum > 0 + and isinstance(repeatable, Character) + ): + return Sequence( + [ + negative_lookahead & repeatable, + InfiniteRepeat(repeatable, minimum - 1), + ] + ) + return InfiniteRepeat(repeatable, minimum) + if ( + negative_lookahead is not None + and minimum > 0 + and maximum > 1 + and isinstance(repeatable, Character) + ): + return Sequence( + [ + negative_lookahead & repeatable, + FiniteRepeat(repeatable, minimum - 1, maximum - 1), + ] + ) + return FiniteRepeat(repeatable, minimum, maximum) + + def from_MIN_REPEAT(self, data): + return self.from_MAX_REPEAT(data) + + def from_BRANCH( + self, data: Tuple[None, List[List[SreOp]]] + ) -> Union[Branch, FiniteRepeat, Character, None]: + # sre already transforms (a|b|c) -> [abc] + branches = data[1] + negative_lookahead = self.use_negative_lookahead() + processed_branches = [] + for branch in branches: + self.negative_lookahead = negative_lookahead + processed_branches.append(self.sequence_or_singleton(branch)) + self.negative_lookahead = None + return make_branch(processed_branches) + + def from_AT(self, at: SreConstant): + # TODO: handling for multiline + # TODO: handling for \\b + self.use_negative_lookahead() + if at is sre_constants.AT_END: + return EndOfString() + return None + + def from_ANY(self, _: None) -> Character: + if negative_lookahead := self.use_negative_lookahead(): + return negative_lookahead + return Character.ANY() + + def from_LITERAL(self, literal: int) -> Character: + if negative_lookahead := self.use_negative_lookahead(): + return Character.LITERAL(literal) & negative_lookahead + return Character.LITERAL(literal) + + def from_NOT_LITERAL(self, not_literal: int) -> Character: + if negative_lookahead := self.use_negative_lookahead(): + return ( + Character(literals={not_literal}, positive=False) & negative_lookahead + ) + return Character(literals={not_literal}, positive=False) + + 
def from_IN(self, data: List[SreOp]) -> Optional[Character]:  # character class; None when it can match nothing
+        literals: Optional[Set[int]] = None
+        categories: Optional[Set] = None
+        positive = True
+        if len(data) > 1 and data[0] == (sre_constants.NEGATE, None):  # leading NEGATE op => negated class
+            positive = False
+            data = data[1:]
+        for in_op, in_data in data:
+            if in_op is sre_constants.LITERAL:
+                if literals is None:
+                    literals = set()
+                literals.add(in_data)
+            elif in_op is sre_constants.RANGE:
+                if literals is None:
+                    literals = set()
+                min_val, max_val = in_data
+                literals.update(range(min_val, max_val + 1))  # RANGE bounds are inclusive
+            elif in_op is sre_constants.CATEGORY:
+                if categories is None:
+                    categories = set()
+                categories.add(Category[in_data.name[9:]])  # name[9:] drops the "CATEGORY_" prefix
+
+        if categories and covers_any(categories):
+            return self.from_ANY(None) if positive else None  # negated "everything" matches nothing
+        if negative_lookahead := self.use_negative_lookahead():
+            return Character(literals, categories, positive) & negative_lookahead
+        return Character(literals, categories, positive)
+
+    def from_GROUPREF(self, ref: int):  # back-reference: resolved through the recorded self._groups
+        return subpattern_to_groupref(self._groups.get(ref))
+
+    @staticmethod
+    def from_GROUPREF_EXISTS(_) -> None:
+        return None  # No intention to implement this properly
+
+    @staticmethod
+    def from_ASSERT(_) -> None:
+        return None  # No intention to implement this properly
+
+    def from_ASSERT_NOT(self, data) -> None:  # stores a single-character negative lookahead for the next op
+        typ, ops = data
+        if typ == 1:  # sre direction 1 = lookahead; anything else (lookbehind) is ignored
+            if len(ops) == 1:
+                character_op = ops[0]
+                if character_op[0] in (
+                    sre_constants.LITERAL,
+                    sre_constants.NOT_LITERAL,
+                    sre_constants.IN,
+                ):
+                    negative_lookahead = self.use_negative_lookahead()
+                    not_assertion = self.parse_op(*character_op)
+                    if not_assertion and (assertion := not_assertion.negate()):
+                        self.negative_lookahead = assertion
+                        if negative_lookahead is not None:  # stack with an earlier pending lookahead
+                            self.negative_lookahead &= negative_lookahead
+                    else:
+                        self.negative_lookahead = negative_lookahead  # restore what was taken above
+
+        return None  # No intention to implement this fully
+
+    def use_negative_lookahead(self) -> Optional[Character]:  # take and clear the pending lookahead
+        if self.negative_lookahead is not None:
+            negative_lookahead = self.negative_lookahead
+            self.negative_lookahead = None
+            return negative_lookahead  # when nothing is pending the method returns None implicitly
diff --git a/ozi_build/_text.py b/ozi_build/_text.py
new file mode 100644
index 0000000..bdef2e9
--- /dev/null
+++ b/ozi_build/_text.py
@@ -0,0 +1,51 @@
+POLYNOMIAL_DEGREES = [  # entry i names polynomial degree i + 1 (indexed with starriness - 1)
+    "linear",
+    "quadratic",
+    "cubic",
+    "quartic",
+    "quintic",
+    "sextic",
+    "septic",
+    "octic",
+    "nonic",
+    "decic",
+]
+
+
+class TextOutput:  # reports findings on stdout via print()
+    def __init__(self, js_flavour: bool = False):
+        self.first_for_regex = True  # True until the current regex's header has been printed
+        self.regexes = 0  # number of next() calls so far
+        self.js_flavour = js_flavour  # forwarded to redos.example() in record()
+
+    def next(self):
+        """Next regex being processed."""
+        self.first_for_regex = True
+        self.regexes += 1
+
+    def record(self, redos, pattern, *, filename=None, lineno=None, context=None):
+        if self.first_for_regex:  # location/pattern header is printed once per regex
+            if filename:
+                if lineno is not None:
+                    print(f"Vulnerable regex in {filename} #{lineno}")
+                else:
+                    print(f"Vulnerable regex in {filename}")
+            print(f"Pattern: {pattern}")
+            if context:
+                print(f"Context: {context}")
+            print("---")
+            self.first_for_regex = False
+        print(redos)
+        stars = "\u2b50" * min(10, redos.starriness)  # at most 10 stars are shown
+        degree = (
+            "exponential"
+            if redos.starriness > 10
+            else POLYNOMIAL_DEGREES[redos.starriness - 1]
+            if redos.starriness > 0
+            else "?"
+        )
+        print(f"Worst-case complexity: {redos.starriness} {stars} ({degree})")
+        print(f"Repeated character: {redos.repeated_character}")
+        if redos.killer:
+            print(f"Final character to cause backtracking: {redos.killer}")
+        print(f"Example: {redos.example(self.js_flavour)}\n")
diff --git a/ozi_build/_util.py b/ozi_build/_util.py
index 9f80aa2..9bcdee8 100644
--- a/ozi_build/_util.py
+++ b/ozi_build/_util.py
@@ -2,11 +2,62 @@
 import os
 import subprocess
 import sys
 
+from fileinput import filename  # NOTE(review): appears unused; looks like an accidental auto-import
+import re
 from .pep425tags import get_abbr_impl
 from .pep425tags import get_abi_tag
 from .pep425tags import get_impl_ver
 from .pep425tags import get_platform_tag
+from ._text import TextOutput
+from ._redos import find
+from ._sre import SreOpParser
+
+
+class PotentialRedos(RuntimeError):
+    """Raised when a scanned string parses as a ReDoS-prone regex."""
+
+
+def handle_file(tomldata, filename: str, output: TextOutput):
+    """Scan ``tomldata`` if it is a list or dict; other types are ignored."""
+    if isinstance(tomldata, (list, dict)):
+        TomlWalker(filename, output).handle(tomldata)
+
+
+class TomlWalker:
+    """Recursively visit list/dict values, checking each string as a regex."""
+
+    def __init__(self, filename: str, output: TextOutput):
+        self.filename = filename
+        self.output = output
+
+    def handle(self, elem):
+        """Check ``elem`` (str, list or dict); raise PotentialRedos on a hit."""
+        if isinstance(elem, str) and len(elem) > 5:
+            try:
+                parsed = SreOpParser().parse_sre(elem)
+            except re.error:
+                return  # We will have many strings which aren't actually regexes
+            self.output.next()
+            for redos in find(parsed):
+                if redos.starriness > 2:
+                    self.output.record(redos, elem, filename=self.filename)
+                    # Abort on the first finding; report the file being
+                    # scanned rather than the imported fileinput.filename.
+                    raise PotentialRedos(redos, elem, self.filename)
+        elif isinstance(elem, list):
+            for _elem in elem:
+                self.handle(_elem)
+        elif isinstance(elem, dict):
+            for _elem in elem.values():
+                self.handle(_elem)
+
+
+def check_pyproject_regexes(file):
+    """Scan the parsed pyproject.toml data ``file`` for ReDoS patterns."""
+    output = TextOutput()
+    handle_file(file, 'pyproject.toml', output)
+
 
 PKG_INFO = """\
 Metadata-Version: 2.2
diff --git a/ozi_build/config.py b/ozi_build/config.py
index b65f98b..24ebe6b 100644
--- a/ozi_build/config.py
+++ b/ozi_build/config.py
@@ -3,6 +3,7 @@
 import os
import sys +from ._util import check_pyproject_regexes from .metadata import auto_python_version from .metadata import check_pkg_info_file from .metadata import check_requires_python @@ -27,6 +28,7 @@ class Config: def __init__(self, builddir=None): config = self.__get_config() + check_pyproject_regexes(config) self.__metadata = config['tool']['ozi-build']['metadata'] self.__entry_points = config['tool']['ozi-build'].get( 'entry-points', [] diff --git a/ozi_build/meson.build b/ozi_build/meson.build index c185eeb..b36640a 100644 --- a/ozi_build/meson.build +++ b/ozi_build/meson.build @@ -1,12 +1,24 @@ sources = [ '__init__.py', - '_pyc_wheel.py', '_util.py', + '_at.py', + '_branch.py', + '_categories.py', + '_char.py', + '_files.py', + '_groupref.py', + '_ranges.py', + '_redos.py', + '_repeat.py', + '_sequence.py', + '_sre.py', + '_text.py', 'buildapi.py', 'config.py', 'metadata.py', 'pep425tags.py', 'schema.py', + '_pyc_wheel.py', ] foreach source: sources fs.copyfile(source) @@ -16,3 +28,4 @@ foreach source: sources subdir : 'ozi_build' ) endforeach +