diff --git a/novelwriter/constants.py b/novelwriter/constants.py index 5186a5d15..bf4f58c60 100644 --- a/novelwriter/constants.py +++ b/novelwriter/constants.py @@ -60,6 +60,7 @@ class nwConst: class nwRegEx: + WORDS = r"\b[^\s\-\+\/–—\[\]:]+\b" FMT_EI = r"(? Iterable[tuple[str, str | None]]: class DocSearch: def __init__(self) -> None: - self._regEx = QRegularExpression() - self.setCaseSensitive(False) + self._regEx = re.compile("") + self._opts = re.UNICODE | re.IGNORECASE self._words = False self._escape = True return @@ -309,10 +310,9 @@ def __init__(self) -> None: def setCaseSensitive(self, state: bool) -> None: """Set the case sensitive search flag.""" - opts = QRegularExpression.PatternOption.UseUnicodePropertiesOption + self._opts = re.UNICODE if not state: - opts |= QRegularExpression.PatternOption.CaseInsensitiveOption - self._regEx.setPatternOptions(opts) + self._opts |= re.IGNORECASE return def setWholeWords(self, state: bool) -> None: @@ -329,8 +329,8 @@ def iterSearch( self, project: NWProject, search: str ) -> Iterable[tuple[NWItem, list[tuple[int, int, str]], bool]]: """Iteratively search through documents in a project.""" - self._regEx.setPattern(self._buildPattern(search)) - logger.debug("Searching with pattern '%s'", self._regEx.pattern()) + self._regEx = re.compile(self._buildPattern(search), self._opts) + logger.debug("Searching with pattern '%s'", self._regEx.pattern) storage = project.storage for item in project.tree: if item.isFileType(): @@ -340,14 +340,12 @@ def iterSearch( def searchText(self, text: str) -> tuple[list[tuple[int, int, str]], bool]: """Search a piece of text for RegEx matches.""" - rxItt = self._regEx.globalMatch(text) count = 0 capped = False results = [] - while rxItt.hasNext(): - rxMatch = rxItt.next() - pos = rxMatch.capturedStart() - num = rxMatch.capturedLength() + for match in re.finditer(self._regEx, text): + pos = match.start(0) + num = len(match.group(0)) lim = text[:pos].rfind("\n") + 1 cut = text[lim:pos].rfind(" ") + lim + 1 context = text[cut:cut+100].partition("\n")[0] @@ -366,7 +364,7 @@ def searchText(self, text: str) -> tuple[list[tuple[int, int, str]], bool]: def _buildPattern(self, search: str) -> str: """Build the search pattern string.""" if self._escape: - search = QRegularExpression.escape(search) + search = re.escape(search) if self._words: search = f"(?:^|\\b){search}(?:$|\\b)" return search diff --git a/novelwriter/core/tokenizer.py b/novelwriter/core/tokenizer.py index b977d9621..4c90127a2 100644 --- a/novelwriter/core/tokenizer.py +++ b/novelwriter/core/tokenizer.py @@ -33,7 +33,7 @@ from pathlib import Path from time import time -from PyQt5.QtCore import QCoreApplication, QRegularExpression +from PyQt5.QtCore import QCoreApplication from PyQt5.QtGui import QFont from novelwriter import CONFIG @@ -234,7 +234,7 @@ def __init__(self, project: NWProject) -> None: nwShortcode.FOOTNOTE_B: self.FMT_FNOTE, } - self._rxDialogue: list[tuple[QRegularExpression, int, int]] = [] + self._rxDialogue: list[tuple[re.Pattern, int, int]] = [] return @@ -1109,55 +1109,45 @@ def _extractFormats( # Match Markdown for regEx, fmts in self._rxMarkdown: - rxItt = regEx.globalMatch(text, 0) - while rxItt.hasNext(): - rxMatch = rxItt.next() + for match in re.finditer(regEx, text): temp.extend( - (rxMatch.capturedStart(n), rxMatch.capturedLength(n), fmt, "") + (match.start(n), match.end(n), fmt, "") for n, fmt in enumerate(fmts) if fmt > 0 ) # Match Shortcodes - rxItt = self._rxShortCodes.globalMatch(text, 0) - while rxItt.hasNext(): - rxMatch = rxItt.next() + for match in re.finditer(REGEX_PATTERNS.shortcodePlain, text): temp.append(( - rxMatch.capturedStart(1), - rxMatch.capturedLength(1), - self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0), + match.start(1), match.end(1), + self._shortCodeFmt.get(match.group(1).lower(), 0), "", )) # Match Shortcode w/Values - rxItt = self._rxShortCodeVals.globalMatch(text, 0) tHandle = self._handle or "" - while rxItt.hasNext(): - rxMatch = rxItt.next() - kind = self._shortCodeVals.get(rxMatch.captured(1).lower(), 0) + for match in re.finditer(REGEX_PATTERNS.shortcodeValue, text): + kind = self._shortCodeVals.get(match.group(1).lower(), 0) temp.append(( - rxMatch.capturedStart(0), - rxMatch.capturedLength(0), + match.start(0), match.end(0), self.FMT_STRIP if kind == skip else kind, - f"{tHandle}:{rxMatch.captured(2)}", + f"{tHandle}:{match.group(2)}", )) # Match Dialogue if self._rxDialogue and hDialog: for regEx, fmtB, fmtE in self._rxDialogue: - rxItt = regEx.globalMatch(text, 0) - while rxItt.hasNext(): - rxMatch = rxItt.next() - temp.append((rxMatch.capturedStart(0), 0, fmtB, "")) - temp.append((rxMatch.capturedEnd(0), 0, fmtE, "")) + for match in re.finditer(regEx, text): + temp.append((match.start(0), 0, fmtB, "")) + temp.append((match.end(0), 0, fmtE, "")) # Post-process text and format result = text formats = [] - for pos, n, fmt, key in reversed(sorted(temp, key=lambda x: x[0])): + for pos, end, fmt, key in reversed(sorted(temp, key=lambda x: x[0])): if fmt > 0: - if n > 0: - result = result[:pos] + result[pos+n:] - formats = [(p-n if p > pos else p, f, k) for p, f, k in formats] + if end > pos: + result = result[:pos] + result[end:] + formats = [(p+pos-end if p > pos else p, f, k) for p, f, k in formats] formats.insert(0, (pos, fmt, key)) return result, formats diff --git a/novelwriter/gui/dochighlight.py b/novelwriter/gui/dochighlight.py index bb2cfc496..1af885e2f 100644 --- a/novelwriter/gui/dochighlight.py +++ b/novelwriter/gui/dochighlight.py @@ -25,10 +25,11 @@ from __future__ import annotations import logging +import re from time import time -from PyQt5.QtCore import QRegularExpression, Qt +from PyQt5.QtCore import Qt from PyQt5.QtGui import ( QBrush, QColor, QFont, QSyntaxHighlighter, QTextBlockUserData, QTextCharFormat, QTextDocument @@ -36,20 +37,16 @@ from novelwriter import CONFIG, SHARED from novelwriter.common import checkInt -from novelwriter.constants import nwHeaders, nwRegEx, nwUnicode +from novelwriter.constants import nwHeaders, nwUnicode from novelwriter.core.index import processComment from novelwriter.enum import nwComment from novelwriter.text.patterns import REGEX_PATTERNS -from novelwriter.types import QRegExUnicode logger = logging.getLogger(__name__) -SPELLRX = QRegularExpression(r"\b[^\s\-\+\/–—\[\]:]+\b") -SPELLRX.setPatternOptions(QRegExUnicode) -SPELLSC = QRegularExpression(nwRegEx.FMT_SC) -SPELLSC.setPatternOptions(QRegExUnicode) -SPELLSV = QRegularExpression(nwRegEx.FMT_SV) -SPELLSV.setPatternOptions(QRegExUnicode) +RX_WORDS = REGEX_PATTERNS.wordSplit +RX_FMT_SC = REGEX_PATTERNS.shortcodePlain +RX_FMT_SV = REGEX_PATTERNS.shortcodeValue BLOCK_NONE = 0 BLOCK_TEXT = 1 @@ -76,9 +73,9 @@ def __init__(self, document: QTextDocument) -> None: self._spellErr = QTextCharFormat() self._hStyles: dict[str, QTextCharFormat] = {} - self._minRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = [] - self._txtRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = [] - self._cmnRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = [] + self._minRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = [] + self._txtRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = [] + self._cmnRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = [] self.initHighlighter() @@ -135,8 +132,7 @@ def initHighlighter(self) -> None: # Multiple or Trailing Spaces if CONFIG.showMultiSpaces: - rxRule = QRegularExpression(r"[ ]{2,}|[ ]*$") - rxRule.setPatternOptions(QRegExUnicode) + rxRule = re.compile(r"[ ]{2,}|[ ]*$", re.UNICODE) hlRule = { 0: self._hStyles["mspaces"], } @@ -145,8 +141,7 @@ def initHighlighter(self) -> None: self._cmnRules.append((rxRule, hlRule)) # Non-Breaking Spaces - rxRule = QRegularExpression(f"[{nwUnicode.U_NBSP}{nwUnicode.U_THNBSP}]+") - rxRule.setPatternOptions(QRegExUnicode) + rxRule = re.compile(f"[{nwUnicode.U_NBSP}{nwUnicode.U_THNBSP}]+", re.UNICODE) hlRule = { 0: self._hStyles["nobreak"], } @@ -237,8 +232,7 @@ def initHighlighter(self) -> None: self._cmnRules.append((rxRule, hlRule)) # Alignment Tags - rxRule = QRegularExpression(r"(^>{1,2}|<{1,2}$)") - rxRule.setPatternOptions(QRegExUnicode) + rxRule = re.compile(r"(^>{1,2}|<{1,2}$)", re.UNICODE) hlRule = { 1: self._hStyles["markup"], } @@ -246,8 +240,7 @@ def initHighlighter(self) -> None: self._txtRules.append((rxRule, hlRule)) # Auto-Replace Tags - rxRule = QRegularExpression(r"<(\S+?)>") - rxRule.setPatternOptions(QRegExUnicode) + rxRule = re.compile(r"<(\S+?)>", re.UNICODE) hlRule = { 0: self._hStyles["replace"], } @@ -409,12 +402,10 @@ def highlightBlock(self, text: str) -> None: if hRules: for rX, hRule in hRules: - rxItt = rX.globalMatch(text, xOff) - while rxItt.hasNext(): - rxMatch = rxItt.next() + for match in re.finditer(rX, text[xOff:]): for xM, hFmt in hRule.items(): - xPos = rxMatch.capturedStart(xM) - xEnd = rxMatch.capturedEnd(xM) + xPos = match.start(xM) + xOff + xEnd = match.end(xM) + xOff for x in range(xPos, xEnd): cFmt = self.format(x) if cFmt.fontStyleName() != "markup": @@ -427,8 +418,8 @@ def highlightBlock(self, text: str) -> None: self.setCurrentBlockUserData(data) if self._spellCheck: - for xPos, xLen in data.spellCheck(text, xOff): - for x in range(xPos, xPos+xLen): + for xPos, xEnd in data.spellCheck(text, xOff): + for x in range(xPos, xEnd): cFmt = self.format(x) cFmt.merge(self._spellErr) self.setFormat(x, 1, cFmt) @@ -492,22 +483,20 @@ def spellCheck(self, text: str, offset: int) -> list[tuple[int, int]]: """ if "[" in text: # Strip shortcodes - for rX in [SPELLSC, SPELLSV]: - rxItt = rX.globalMatch(text, offset) - while rxItt.hasNext(): - rxMatch = rxItt.next() - xPos = rxMatch.capturedStart(0) - xLen = rxMatch.capturedLength(0) - xEnd = rxMatch.capturedEnd(0) - text = text[:xPos] + " "*xLen + text[xEnd:] + for rX in [RX_FMT_SC, RX_FMT_SV]: + for match in re.finditer(rX, text[offset:]): + iS = match.start(0) + offset + iE = match.end(0) + offset + if iS >= 0 and iE >= 0: + text = text[:iS] + " "*(iE - iS) + text[iE:] self._spellErrors = [] - rxSpell = SPELLRX.globalMatch(text.replace("_", " "), offset) - while rxSpell.hasNext(): - rxMatch = rxSpell.next() - if not SHARED.spelling.checkWord(rxMatch.captured(0)): - if not rxMatch.captured(0).isnumeric() and not rxMatch.captured(0).isupper(): - self._spellErrors.append( - (rxMatch.capturedStart(0), rxMatch.capturedLength(0)) - ) + checker = SHARED.spelling + for match in re.finditer(RX_WORDS, text[offset:].replace("_", " ")): + if ( + (word := match.group(0)) + and not (word.isnumeric() or word.isupper() or checker.checkWord(word)) + ): + self._spellErrors.append((match.start(0) + offset, match.end(0) + offset)) + return self._spellErrors diff --git a/novelwriter/gui/editordocument.py b/novelwriter/gui/editordocument.py index e071658c7..6e46d12dc 100644 --- a/novelwriter/gui/editordocument.py +++ b/novelwriter/gui/editordocument.py @@ -107,8 +107,8 @@ def spellErrorAtPos(self, pos: int) -> tuple[str, int, int, list[str]]: text = block.text() check = pos - block.position() if check >= 0: - for cPos, cLen in data.spellErrors: - cEnd = cPos + cLen + for cPos, cEnd in data.spellErrors: + cLen = cEnd - cPos if cPos <= check <= cEnd: word = text[cPos:cEnd] return word, cPos, cLen, SHARED.spelling.suggestWords(word) diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py index 4d60222e0..c3a7cf54f 100644 --- a/novelwriter/text/patterns.py +++ b/novelwriter/text/patterns.py @@ -23,52 +23,54 @@ """ from __future__ import annotations -from PyQt5.QtCore import QRegularExpression +import re from novelwriter import CONFIG from novelwriter.constants import nwRegEx -from novelwriter.types import QRegExUnicode class RegExPatterns: + # Static RegExes + _rxWords = re.compile(nwRegEx.WORDS, re.UNICODE) + _rxItalic = re.compile(nwRegEx.FMT_EI, re.UNICODE) + _rxBold = re.compile(nwRegEx.FMT_EB, re.UNICODE) + _rxStrike = re.compile(nwRegEx.FMT_ST, re.UNICODE) + _rxSCPlain = re.compile(nwRegEx.FMT_SC, re.UNICODE) + _rxSCValue = re.compile(nwRegEx.FMT_SV, re.UNICODE) + + @property + def wordSplit(self) -> re.Pattern: + """Split text into words.""" + return self._rxWords + @property - def markdownItalic(self) -> QRegularExpression: + def markdownItalic(self) -> re.Pattern: """Markdown italic style.""" - rxRule = QRegularExpression(nwRegEx.FMT_EI) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return self._rxItalic @property - def markdownBold(self) -> QRegularExpression: + def markdownBold(self) -> re.Pattern: """Markdown bold style.""" - rxRule = QRegularExpression(nwRegEx.FMT_EB) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return self._rxBold @property - def markdownStrike(self) -> QRegularExpression: + def markdownStrike(self) -> re.Pattern: """Markdown strikethrough style.""" - rxRule = QRegularExpression(nwRegEx.FMT_ST) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return self._rxStrike @property - def shortcodePlain(self) -> QRegularExpression: + def shortcodePlain(self) -> re.Pattern: """Plain shortcode style.""" - rxRule = QRegularExpression(nwRegEx.FMT_SC) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return self._rxSCPlain @property - def shortcodeValue(self) -> QRegularExpression: + def shortcodeValue(self) -> re.Pattern: """Plain shortcode style.""" - rxRule = QRegularExpression(nwRegEx.FMT_SV) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return self._rxSCValue @property - def dialogStyle(self) -> QRegularExpression: + def dialogStyle(self) -> re.Pattern: """Dialogue detection rule based on user settings.""" symO = "" symC = "" @@ -80,34 +82,26 @@ def dialogStyle(self) -> QRegularExpression: symC += CONFIG.fmtDQuoteClose rxEnd = "|$" if CONFIG.allowOpenDial else "" - rxRule = QRegularExpression(f"\\B[{symO}].*?(?:[{symC}]\\B{rxEnd})") - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return re.compile(f"\\B[{symO}].*?(?:[{symC}]\\B{rxEnd})", re.UNICODE) @property - def dialogLine(self) -> QRegularExpression: + def dialogLine(self) -> re.Pattern: """Dialogue line rule based on user settings.""" - sym = QRegularExpression.escape(CONFIG.dialogLine) - rxRule = QRegularExpression(f"^{sym}.*?$") - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + sym = re.escape(CONFIG.dialogLine) + return re.compile(f"^{sym}.*?$", re.UNICODE) @property - def narratorBreak(self) -> QRegularExpression: + def narratorBreak(self) -> re.Pattern: """Dialogue narrator break rule based on user settings.""" - sym = QRegularExpression.escape(CONFIG.narratorBreak) - rxRule = QRegularExpression(f"\\B{sym}\\S.*?\\S{sym}\\B") - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + sym = re.escape(CONFIG.narratorBreak) + return re.compile(f"\\B{sym}\\S.*?\\S{sym}\\B", re.UNICODE) @property - def altDialogStyle(self) -> QRegularExpression: + def altDialogStyle(self) -> re.Pattern: """Dialogue alternative rule based on user settings.""" - symO = QRegularExpression.escape(CONFIG.altDialogOpen) - symC = QRegularExpression.escape(CONFIG.altDialogClose) - rxRule = QRegularExpression(f"\\B{symO}.*?{symC}\\B") - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + symO = re.escape(CONFIG.altDialogOpen) + symC = re.escape(CONFIG.altDialogClose) + return re.compile(f"\\B{symO}.*?{symC}\\B", re.UNICODE) REGEX_PATTERNS = RegExPatterns() diff --git a/novelwriter/types.py b/novelwriter/types.py index 40f22acf2..ab684deef 100644 --- a/novelwriter/types.py +++ b/novelwriter/types.py @@ -23,7 +23,7 @@ """ from __future__ import annotations -from PyQt5.QtCore import QRegularExpression, Qt +from PyQt5.QtCore import Qt from PyQt5.QtGui import QColor, QFont, QPainter, QTextCharFormat, QTextCursor, QTextFormat from PyQt5.QtWidgets import QDialog, QDialogButtonBox, QSizePolicy, QStyle @@ -115,10 +115,6 @@ QtScrollAlwaysOff = Qt.ScrollBarPolicy.ScrollBarAlwaysOff QtScrollAsNeeded = Qt.ScrollBarPolicy.ScrollBarAsNeeded -# Other - -QRegExUnicode = QRegularExpression.PatternOption.UseUnicodePropertiesOption - # Maps FONT_WEIGHTS: dict[int, int] = { diff --git a/tests/test_core/test_core_coretools.py b/tests/test_core/test_core_coretools.py index faf11cfc4..3827db7a9 100644 --- a/tests/test_core/test_core_coretools.py +++ b/tests/test_core/test_core_coretools.py @@ -421,7 +421,7 @@ def testCoreTools_DocSearch(monkeypatch, mockGUI, fncPath, mockRnd, ipsumText): # Patterns # ======== - # Escape Using QRegularExpression + # Escape assert search._buildPattern("[A-Za-z0-9_]+") == r"\[A\-Za\-z0\-9_\]\+" # Whole Words diff --git a/tests/test_text/test_text_patterns.py b/tests/test_text/test_text_patterns.py index 1b4623fd8..7f1def906 100644 --- a/tests/test_text/test_text_patterns.py +++ b/tests/test_text/test_text_patterns.py @@ -20,28 +20,72 @@ """ from __future__ import annotations -import pytest +import re -from PyQt5.QtCore import QRegularExpression +import pytest from novelwriter import CONFIG from novelwriter.constants import nwUnicode from novelwriter.text.patterns import REGEX_PATTERNS -def allMatches(regEx: QRegularExpression, text: str) -> list[list[str]]: +def allMatches(regEx: re.Pattern, text: str) -> list[list[str]]: """Get all matches for a regex.""" result = [] - itt = regEx.globalMatch(text, 0) - while itt.hasNext(): - match = itt.next() + for match in re.finditer(regEx, text): result.append([ - (match.captured(n), match.capturedStart(n), match.capturedEnd(n)) - for n in range(match.lastCapturedIndex() + 1) + (match.group(n), match.start(n), match.end(n)) + for n in range((match.lastindex or 0) + 1) ]) return result +@pytest.mark.core +def testTextPatterns_Words(): + """Test the word split regex.""" + regEx = REGEX_PATTERNS.wordSplit + + # Spaces + assert allMatches(regEx, "one two three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Hyphens + assert allMatches(regEx, "one-two-three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Em Dashes + assert allMatches(regEx, "one\u2014two\u2014three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Em Dashes + assert allMatches(regEx, "one\u2014two\u2014three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Plus + assert allMatches(regEx, "one+two+three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Slash + assert allMatches(regEx, "one/two/three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Brackets + assert allMatches(regEx, "one[two]three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Colon + assert allMatches(regEx, "one:two:three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + @pytest.mark.core def testTextPatterns_Markdown(): """Test the markdown pattern regexes."""