Port USFM code from Machine up to commit a9058ce (#111)

* port commit 436a67d that moved logic to parallel text corpus
* port commit 436a67d that adds test cases for scripture text corpus
* port commit 29c9799 that adds support for mixed source corpora
* port commit fa65835, default to major biblical terms
* port commit a9058ce, support for non-verse text segments in Scripture corpora
* handle overloading, update __init__.py, use is_scripture, move files to corpora folder, use top level constant for empty ScriptureRef, change __eq__ back to exact_equals, keep test files consistent
sillsdev · Aug 15, 2024 · 953f203 · 953f203
1 parent 2f7f44f
commit 953f203
Show file tree

Hide file tree

Showing 35 changed files with 1,397 additions and 461 deletions.
diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
@@ -15,7 +15,15 @@
 from .parallel_text_row import ParallelTextRow
 from .paratext_backup_text_corpus import ParatextBackupTextCorpus
 from .paratext_text_corpus import ParatextTextCorpus
-from .scripture_text_corpus import ScriptureTextCorpus, create_versification_ref_corpus, extract_scripture_corpus
+from .scripture_element import ScriptureElement
+from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
+from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
+from .scripture_text_corpus import (
+    ScriptureTextCorpus,
+    create_versification_ref_corpus,
+    extract_scripture_corpus,
+    is_scripture,
+)
 from .standard_parallel_text_corpus import StandardParallelTextCorpus
 from .text import Text
 from .text_corpus import TextCorpus
@@ -41,6 +49,7 @@
 from .usfm_parser_state import UsfmElementType, UsfmParserElement, UsfmParserState
 from .usfm_stylesheet import UsfmStylesheet
 from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType
+from .usfm_text_updater import UsfmTextUpdater
 from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
 from .usfm_tokenizer import RtlReferenceOrder, UsfmTokenizer
 from .usx_file_alignment_collection import UsxFileAlignmentCollection
@@ -60,9 +69,11 @@
     "DblBundleTextCorpus",
     "DictionaryAlignmentCorpus",
     "DictionaryTextCorpus",
+    "EMPTY_SCRIPTURE_REF",
     "escape_spaces",
     "extract_scripture_corpus",
     "flatten",
+    "is_scripture",
     "lowercase",
     "MemoryAlignmentCollection",
     "MemoryText",
@@ -78,7 +89,11 @@
     "ParatextTextCorpus",
     "parse_usfm",
     "RtlReferenceOrder",
+    "ScriptureElement",
+    "ScriptureRef",
+    "ScriptureRefUsfmParserHandler",
     "ScriptureTextCorpus",
+    "ScriptureTextType",
     "StandardParallelTextCorpus",
     "Text",
     "TextCorpus",
@@ -104,6 +119,7 @@
     "UsfmTag",
     "UsfmTextProperties",
     "UsfmTextType",
+    "UsfmTextUpdater",
     "UsfmToken",
     "UsfmTokenizer",
     "UsfmTokenType",

diff --git a/machine/corpora/dictionary_text_corpus.py b/machine/corpora/dictionary_text_corpus.py
@@ -1,5 +1,6 @@
 from typing import Iterable, Optional, overload
 
+from ..scripture.verse_ref import Versification
 from .text import Text
 from .text_corpus import TextCorpus
 
@@ -21,6 +22,7 @@ def __init__(self, *args, **kwargs) -> None:
             texts = args[0]
         self._texts = {t.id: t for t in texts}
         self._is_tokenized = False
+        self._versification = None
 
     @property
     def texts(self) -> Iterable[Text]:
@@ -34,6 +36,14 @@ def is_tokenized(self) -> bool:
     def is_tokenized(self, value: bool) -> None:
         self._is_tokenized = value
 
+    @property
+    def versification(self) -> Optional[Versification]:  # None until a value is assigned through the setter
+        return self._versification
+
+    @versification.setter
+    def versification(self, value: Versification) -> None:
+        self._versification = value
+
     def __getitem__(self, id: str) -> Optional[Text]:
         return self._texts.get(id)
 

diff --git a/machine/corpora/flatten.py b/machine/corpora/flatten.py
@@ -1,6 +1,7 @@
 from itertools import chain
 from typing import Generator, Iterable, List, Optional, cast, overload
 
+from ..scripture.verse_ref import Versification
 from .alignment_collection import AlignmentCollection
 from .alignment_corpus import AlignmentCorpus
 from .alignment_row import AlignmentRow
@@ -54,6 +55,10 @@ def texts(self) -> Iterable[Text]:
     def is_tokenized(self) -> bool:
         return all(c.is_tokenized for c in self._corpora)
 
+    @property
+    def versification(self) -> Optional[Versification]:  # first corpus's versification; None if no corpora
+        return self._corpora[0].versification if len(self._corpora) > 0 else None
+
     def count(self, include_empty: bool = True) -> int:
         return sum(c.count(include_empty) for c in self._corpora)
 

diff --git a/machine/corpora/parallel_text_row.py b/machine/corpora/parallel_text_row.py
@@ -18,6 +18,8 @@ def __init__(
         source_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
         target_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
     ) -> None:
+        if not text_id:
+            raise ValueError("A text_id must be set.")
         if len(source_refs) == 0 and len(target_refs) == 0:
             raise ValueError("Either a source or target ref must be set.")
         self._text_id = text_id

diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py
@@ -10,7 +10,7 @@
 
 
 class ParatextBackupTextCorpus(ScriptureTextCorpus):
-    def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
+    def __init__(self, filename: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
         with ZipFile(filename, "r") as archive:
             parser = ZipParatextProjectSettingsParser(archive)
             settings = parser.parse()
@@ -28,6 +28,7 @@ def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
                         sfm_entry.filename,
                         versification,
                         include_markers,
+                        include_all_text,
                     )
                 )
 

diff --git a/machine/corpora/paratext_project_settings_parser_base.py b/machine/corpora/paratext_project_settings_parser_base.py
@@ -73,8 +73,16 @@ def parse(self) -> ParatextProjectSettings:
             post_part = naming_elem.get("PostPart")
             if post_part:
                 suffix = post_part
-        biblical_terms = settings_tree.getroot().findtext("BiblicalTermsListSetting", "")
-        parts = biblical_terms.split(":", 2)
+        biblical_terms_list_setting = settings_tree.getroot().findtext("BiblicalTermsListSetting")
+        if biblical_terms_list_setting is None:
+            # No such element: default to Major::BiblicalTerms.xml to mirror Paratext behavior
+            biblical_terms_list_setting = "Major::BiblicalTerms.xml"
+        parts = biblical_terms_list_setting.split(":", 2)
+        if len(parts) != 3:
+            raise ValueError(
+                f"The BiblicalTermsListSetting element in Settings.xml in project {full_name}"
+                f" is not in the expected format (i.e., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}."
+            )
 
         return ParatextProjectSettings(
             name, full_name, encoding, versification, stylesheet, prefix, form, suffix, parts[0], parts[1], parts[2]

diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py
@@ -8,7 +8,7 @@
 
 
 class ParatextTextCorpus(ScriptureTextCorpus):
-    def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
+    def __init__(self, project_dir: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
         parser = FileParatextProjectSettingsParser(project_dir)
         settings = parser.parse()
 
@@ -17,7 +17,14 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
         texts: List[UsfmFileText] = []
         for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
             texts.append(
-                UsfmFileText(settings.stylesheet, settings.encoding, sfm_filename, versification, include_markers)
+                UsfmFileText(
+                    settings.stylesheet,
+                    settings.encoding,
+                    sfm_filename,
+                    versification,
+                    include_markers,
+                    include_all_text,
+                )
             )
 
         super().__init__(versification, texts)
diff --git a/machine/corpora/scripture_element.py b/machine/corpora/scripture_element.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from functools import total_ordering
+from typing import Optional
+
+from ..utils.comparable import Comparable
+
+
+@total_ordering
+class ScriptureElement(Comparable):
+    """A named element with an integer position, ordered by position and then by name."""
+
+    def __init__(self, position: int, name: str) -> None:
+        self._position = position
+        self._name = name
+
+    @property
+    def position(self) -> int:
+        return self._position
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    def compare_to(self, other: object, strict: Optional[bool] = True) -> int:
+        """Order this element relative to another ScriptureElement.
+
+        With a truthy strict, positions are compared first and names only break ties; otherwise only names
+        are compared. Raises TypeError when other is not a ScriptureElement.
+        """
+        if not isinstance(other, ScriptureElement):
+            raise TypeError("other is not a ScriptureElement object.")
+        if self is other:
+            return 0
+
+        if strict:
+            position_delta = self.position - other.position
+            if position_delta != 0:
+                return position_delta
+
+        return (self.name > other.name) - (self.name < other.name)
+
+    def __eq__(self, other: ScriptureElement) -> bool:
+        if isinstance(other, ScriptureElement):
+            return (self.position, self.name) == (other.position, other.name)
+        return NotImplemented
+
+    def __lt__(self, other: ScriptureElement) -> bool:
+        if isinstance(other, ScriptureElement):
+            return self.compare_to(other) < 0
+        return NotImplemented
+
+    def __hash__(self) -> int:
+        return hash((self.position, self.name))
+
+    def __repr__(self):
+        return self.name if self.position == 0 else f"{self.position}:{self.name}"
diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from functools import total_ordering
+from typing import List, Optional
+
+from ..scripture.constants import ENGLISH_VERSIFICATION
+from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
+from ..utils.comparable import Comparable
+from .scripture_element import ScriptureElement
+
+
+@total_ordering
+class ScriptureRef(Comparable):
+    """A verse reference plus an optional path of ScriptureElement objects."""
+
+    def __init__(self, ref: Optional[VerseRef] = None, path: Optional[List[ScriptureElement]] = None) -> None:
+        # Defaults are created per instance rather than shared through the signature.
+        self._verse_ref: VerseRef = ref if ref is not None else VerseRef()
+        self._path: List[ScriptureElement] = path if path is not None else []
+
+    _empty: Optional[ScriptureRef] = None
+
+    @classmethod
+    def parse(cls, selection: str, versification: Optional[Versification] = None) -> ScriptureRef:
+        """Parse a string of the form "<verse ref>[/<element>...]" into a ScriptureRef.
+
+        Each element is either "<position>:<name>" or just "<name>", in which case the position is 0.
+        ENGLISH_VERSIFICATION is used when no versification is given.
+        """
+        vref, *elements = selection.split("/")
+        path: List[ScriptureElement] = []
+        for element in elements:
+            fields: List[str] = element.split(":")
+            if len(fields) == 1:
+                path.append(ScriptureElement(0, fields[0]))
+            else:
+                path.append(ScriptureElement(int(fields[0]), fields[1]))
+
+        if versification is None:
+            versification = ENGLISH_VERSIFICATION
+        return cls(VerseRef.from_string(vref, versification), path)
+
+    @property
+    def verse_ref(self) -> VerseRef:
+        return self._verse_ref
+
+    @property
+    def path(self) -> List[ScriptureElement]:
+        return self._path
+
+    @property
+    def book_num(self) -> int:
+        return self.verse_ref.book_num
+
+    @property
+    def chapter_num(self) -> int:
+        return self.verse_ref.chapter_num
+
+    @property
+    def verse_num(self) -> int:
+        return self.verse_ref.verse_num
+
+    @property
+    def book(self) -> str:
+        return self.verse_ref.book
+
+    @property
+    def chapter(self) -> str:
+        return self.verse_ref.chapter
+
+    @property
+    def verse(self) -> str:
+        return self.verse_ref.verse
+
+    @property
+    def versification(self) -> Versification:
+        return self.verse_ref.versification
+
+    @property
+    def is_empty(self) -> bool:
+        """Whether the underlying verse reference reports is_default."""
+        return self.verse_ref.is_default
+
+    @property
+    def is_verse(self) -> bool:
+        """Whether this reference has a non-zero verse number and an empty path."""
+        # Use this instance's verse number; VerseRef.verse_num is looked up on the class, not on this reference.
+        return self.verse_ref.verse_num != 0 and len(self.path) == 0
+
+    def change_versification(self, versification: Versification) -> ScriptureRef:
+        """Return a new ScriptureRef with a copy of the verse reference converted to the given versification."""
+        vr: VerseRef = self.verse_ref.copy()
+        vr.change_versification(versification)
+        return ScriptureRef(vr, self.path)
+
+    def overlaps(self, other: ScriptureRef) -> bool:
+        """Whether the verse ranges of both references overlap and their paths are equal."""
+        if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
+            return False
+        return self.path == other.path
+
+    def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True) -> int:
+        """Compare by verse reference first, then path element by element, then by path length.
+
+        compare_segments is forwarded to VerseRef.compare_to and strict to ScriptureElement.compare_to.
+        Returns 0 when nothing differs; raises TypeError if other is not a ScriptureRef.
+        """
+        if not isinstance(other, ScriptureRef):
+            raise TypeError("other is not a ScriptureRef object.")
+        if self is other:
+            return 0
+
+        res = self.verse_ref.compare_to(other.verse_ref, compare_segments=compare_segments)
+        if res != 0:
+            return res
+
+        for se1, se2 in zip(self.path, other.path):
+            res = se1.compare_to(se2, strict=strict)
+            if res != 0:
+                return res
+
+        return len(self.path) - len(other.path)
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, ScriptureRef):
+            return NotImplemented
+        return self.verse_ref == other.verse_ref and self.path == other.path
+
+    def __lt__(self, other: object) -> bool:
+        if not isinstance(other, ScriptureRef):
+            return NotImplemented
+        return self.compare_to(other) < 0
+
+    def __hash__(self) -> int:
+        return hash((self.verse_ref, tuple(self.path)))
+
+    def __repr__(self) -> str:
+        # Only add the "/" separator when there are path elements, so a plain verse has no trailing slash.
+        return "/".join([str(self.verse_ref), *(str(se) for se in self.path)])
+
+
+EMPTY_SCRIPTURE_REF = ScriptureRef()