sillsdev · mshannon-sil · Apr 4, 2024 · Mar 27, 2024 · Mar 28, 2024 · Apr 4, 2024
diff --git a/machine/corpora/corpora_utils.py b/machine/corpora/corpora_utils.py
@@ -8,8 +8,9 @@
 
 import regex as re
 
+from ..scripture import ENGLISH_VERSIFICATION
 from ..scripture.canon import book_id_to_number
-from ..scripture.verse_ref import VERSE_RANGE_SEPARATOR, VERSE_SEQUENCE_INDICATOR, Versification, VersificationType
+from ..scripture.verse_ref import VERSE_RANGE_SEPARATOR, VERSE_SEQUENCE_INDICATOR, Versification
 
 T = TypeVar("T")
 
@@ -107,7 +108,7 @@ def get_usx_versification(project_dir: Path, versification: Optional[Versificati
     if versification is None and versification_filename.is_file():
         versification_name = project_dir.name
         versification = Versification.load(versification_filename, fallback_name=versification_name)
-    return Versification.get_builtin(VersificationType.ENGLISH) if versification is None else versification
+    return ENGLISH_VERSIFICATION if versification is None else versification
 
 
 def merge_verse_ranges(verse1: str, verse2: str) -> str:

diff --git a/machine/corpora/dbl_bundle_text_corpus.py b/machine/corpora/dbl_bundle_text_corpus.py
@@ -4,7 +4,8 @@
 from typing import List
 from zipfile import ZipFile
 
-from ..scripture.verse_ref import Versification, VersificationType
+from ..scripture import ENGLISH_VERSIFICATION
+from ..scripture.verse_ref import Versification
 from ..utils.typeshed import StrPath
 from .scripture_text_corpus import ScriptureTextCorpus
 from .usx_zip_text import UsxZipText
@@ -32,7 +33,7 @@ def __init__(self, filename: StrPath) -> None:
                         TextIOWrapper(stream, encoding="utf-8-sig"), "versification.vrs", fallback_name=abbr
                     )
             else:
-                versification = Versification.get_builtin(VersificationType.ENGLISH)
+                versification = ENGLISH_VERSIFICATION
 
         texts: List[UsxZipText] = []
         for content_elem in doc.getroot().findall("./publications/publication[@default='true']/structure/content"):

diff --git a/machine/corpora/file_paratext_project_settings_parser.py b/machine/corpora/file_paratext_project_settings_parser.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+from typing import BinaryIO, Optional
+
+from ..utils.typeshed import StrPath
+from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
+from .usfm_stylesheet import UsfmStylesheet
+
+
+class FileParatextProjectSettingsParser(ParatextProjectSettingsParserBase):
+    """Settings parser for a Paratext project stored as a directory on disk.
+
+    Every file name passed to the methods below is resolved relative to the
+    project directory given to the constructor.
+    """
+
+    def __init__(self, project_dir: StrPath) -> None:
+        """Remember the project directory that file names are resolved against."""
+        self._project_dir = Path(project_dir)
+
+    def create_stylesheet(self, file_name: StrPath) -> UsfmStylesheet:
+        """Create the stylesheet `file_name`, with the project's `custom.sty` as override if it exists.
+
+        :param file_name: Name of the base stylesheet; passed to `UsfmStylesheet` unchanged.
+        :return: The stylesheet; the override argument is None when the project has no `custom.sty`.
+        """
+        # The override file has a fixed name. Deriving its path from `file_name` would hand
+        # the base stylesheet to UsfmStylesheet a second time as its own override.
+        custom_stylesheet_filename = self._project_dir / "custom.sty"
+        return UsfmStylesheet(
+            file_name,
+            custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
+        )
+
+    def exists(self, file_name: StrPath) -> bool:
+        """Return True if `file_name` is a regular file inside the project directory."""
+        return (self._project_dir / file_name).is_file()
+
+    def find(self, extension: str) -> Optional[Path]:
+        """Return the first entry of the project directory whose name ends with `extension`, or None."""
+        return next(self._project_dir.glob(f"*{extension}"), None)
+
+    def open(self, file_name: StrPath) -> BinaryIO:
+        """Open `file_name` from the project directory for binary reading; the caller closes it."""
+        return open(self._project_dir / file_name, "rb")
diff --git a/machine/corpora/paratext_backup_terms_corpus.py b/machine/corpora/paratext_backup_terms_corpus.py
@@ -0,0 +1,105 @@
+import re
+import xml.etree.ElementTree as ET
+from typing import Dict, List, Optional
+from zipfile import ZipFile
+
+from .corpora_utils import get_entry
+from .dictionary_text_corpus import DictionaryTextCorpus
+from .memory_text import MemoryText
+from .text_row import TextRow
+from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
+
+_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"]
+
+
+class ParatextBackupTermsCorpus(DictionaryTextCorpus):
+    """Dictionary corpus of term renderings read from a Paratext project backup (zip archive).
+
+    The corpus holds a single text whose rows pair a term id with the renderings
+    listed for it in the backup's `TermRenderings.xml`.
+    """
+
+    def __init__(self, filename: str, term_categories: List[str]) -> None:
+        """Load the term renderings from the backup at `filename`.
+
+        :param filename: Path of the project backup zip file.
+        :param term_categories: When non-empty, only terms whose category is in this list are
+            kept (terms without a known category are skipped); when empty, all terms are kept.
+        """
+        # Initialise the base corpus first so that both the early return below and
+        # `_add_text` operate on a fully constructed object.
+        # NOTE(review): DictionaryTextCorpus is not visible here; this assumes its
+        # no-argument constructor creates an empty corpus -- confirm.
+        super().__init__()
+        rows: List[TextRow] = []
+        with ZipFile(filename, "r") as archive:
+            terms_file_entry = get_entry(archive, "TermRenderings.xml")
+            if terms_file_entry is None:
+                # No renderings in this backup: leave the corpus empty.
+                return
+            settings_parser = ZipParatextProjectSettingsParser(archive)
+            settings = settings_parser.parse()
+
+            with archive.open(terms_file_entry) as key_terms_file:
+                term_renderings_tree = ET.parse(key_terms_file)
+
+            biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name)
+            if settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
+                # NOTE(review): this opens the terms file relative to the current working
+                # directory, not from the archive -- confirm where the predefined lists live.
+                with open(settings.biblical_terms_file_name, "rb") as key_terms_file:
+                    biblical_terms_tree = ET.parse(key_terms_file)
+                    term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
+            elif (
+                settings.biblical_terms_list_type == "Project"
+                and settings.biblical_terms_project_name == settings.name
+                and biblical_terms_file_entry is not None
+            ):
+                with archive.open(biblical_terms_file_entry) as key_terms_file:
+                    biblical_terms_tree = ET.parse(key_terms_file)
+                    term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
+            else:
+                # No usable terms list: every term ends up with an empty category.
+                term_id_to_category_dict = {}
+
+            # iter() takes a plain tag name; an XPath-like ".//TermRendering" would be compared
+            # literally against element tags and never match anything.
+            terms_elements = term_renderings_tree.iter("TermRendering")
+            text_id = (
+                f"{settings.biblical_terms_list_type}:"
+                f"{settings.biblical_terms_project_name}:"
+                f"{settings.biblical_terms_file_name}"
+            )
+            for e in terms_elements:
+                term_id = e.attrib["Id"]
+                category = term_id_to_category_dict.get(term_id, "")
+                if term_categories and (category == "" or category not in term_categories):
+                    continue
+                term_id = term_id.replace("\n", "&#xA")
+                rendering = e.findtext("Renderings", "")
+                renderings = _get_renderings(rendering)
+                rows.append(TextRow(text_id, term_id, segment=renderings))
+            text = MemoryText(text_id, rows)
+            self._add_text(text)
+
+
+def _get_renderings(rendering: str) -> List[str]:
+    """Split a raw term-rendering string into its distinct, cleaned-up glosses.
+
+    `?` and `*` are dropped, `/` becomes a space, parenthesized and bracketed segments are
+    removed, a trailing number such as ` 1` or ` 1.2` is stripped, and the remainder is
+    split on `||`.
+
+    :param rendering: Rendering text of one term.
+    :return: The non-empty glosses, de-duplicated, in order of first appearance.
+    """
+    # If entire term rendering is surrounded in square brackets, remove them
+    match = re.match(r"^\[(.+?)\]$", rendering)
+    if match:
+        rendering = match.group(1)
+    rendering = rendering.replace("?", "")
+    rendering = rendering.replace("*", "")
+    rendering = rendering.replace("/", " ")
+    rendering = rendering.strip()
+    rendering = _strip_parens(rendering)
+    rendering = _strip_parens(rendering, left="[", right="]")
+    # sub() removes the whole match. findall() on a pattern with a capture group returns only
+    # the group's text (often ""), so replacing those results never stripped the number.
+    rendering = re.sub(r"\s+\d+(\.\d+)*$", "", rendering)
+    stripped = (g.strip() for g in rendering.split("||"))
+    # dict.fromkeys de-duplicates while keeping a stable order; set() order varies between runs.
+    return list(dict.fromkeys(g for g in stripped if g != ""))
+
+
+def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
+    """Remove every balanced, outermost `left`...`right` group (delimiters included).
+
+    The string is scanned from right to left; delimiters that have no partner are kept.
+    """
+    depth = 0
+    group_end = -1
+    # Outermost groups as (start, stop) slices, collected from the rightmost to the leftmost.
+    groups = []
+    for pos in reversed(range(len(term_string))):
+        ch = term_string[pos]
+        if ch == right:
+            if depth == 0:
+                group_end = pos + 1
+            depth += 1
+        elif ch == left and depth > 0:
+            depth -= 1
+            if depth == 0:
+                groups.append((pos, group_end))
+    # Cutting from the right first keeps the offsets of the remaining groups valid.
+    for start, stop in groups:
+        term_string = term_string[:start] + term_string[stop:]
+    return term_string
+
+
+def _get_category_per_id(biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]:
+    """Map the `Id` attribute of each `Term` element in the tree to the text of its `Category` child.
+
+    :param biblical_terms_tree: Parsed biblical terms XML document.
+    :return: Term id -> category text; "" when the term has no `Category` child or it is empty.
+    """
+    term_id_to_category_dict = {}
+    # iter() matches on the plain tag name; passing ".//Term" would never match any element
+    # and the mapping would always come back empty.
+    for e in biblical_terms_tree.iter("Term"):
+        category_element = e.find("Category")
+        category = category_element.text if category_element is not None and category_element.text is not None else ""
+        term_id_to_category_dict[e.attrib["Id"]] = category
+    return term_id_to_category_dict
diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py
@@ -1,121 +1,34 @@
-import xml.etree.ElementTree as etree
-from io import TextIOWrapper
-from tempfile import TemporaryFile
-from typing import List, Optional
-from zipfile import ZipFile, ZipInfo
+from typing import List
+from zipfile import ZipFile
 
 import regex as re
 
-from ..scripture.verse_ref import Versification
-from ..utils.file_utils import detect_encoding_from_stream
-from ..utils.string_utils import parse_integer
 from ..utils.typeshed import StrPath
-from .corpora_utils import find_entry, get_encoding, get_entry
 from .scripture_text_corpus import ScriptureTextCorpus
-from .usfm_stylesheet import UsfmStylesheet
 from .usfm_zip_text import UsfmZipText
+from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
 
 
 class ParatextBackupTextCorpus(ScriptureTextCorpus):
+    """Scripture corpus built from the USFM files inside a Paratext project backup (zip archive)."""
+
     def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
+        """Create one text per archive entry that matches the project's book file naming.
+
+        :param filename: Path of the project backup zip file.
+        :param include_markers: Passed through unchanged to each `UsfmZipText`.
+        """
         with ZipFile(filename, "r") as archive:
-            settings_entry = get_entry(archive, "Settings.xml")
-            if settings_entry is None:
-                settings_entry = find_entry(archive, lambda zi: zi.filename.endswith(".ssf"))
-            if settings_entry is None:
-                raise ValueError("The project backup does not contain a settings file.")
+            # All project settings are read through the zip-backed settings parser.
+            parser = ZipParatextProjectSettingsParser(archive)
+            settings = parser.parse()
 
-            with archive.open(settings_entry, "r") as file:
-                settings_tree = etree.parse(file)
+            versification = settings.versification
+            # Book files are the entries named <file_name_prefix>...<file_name_suffix>.
+            regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")
 
-            encoding_str = settings_tree.getroot().findtext("Encoding", "65001")
-            code_page = parse_integer(encoding_str)
-            if code_page is None:
-                raise NotImplementedError(
-                    f"The project uses a legacy encoding that requires TECKit, map file: {encoding_str}."
-                )
-            encoding = get_encoding(code_page)
-            if encoding is None:
-                raise RuntimeError(f"Code page {code_page} not supported.")
-
-            versification_type = int(settings_tree.getroot().findtext("Versification", "4"))
-            versification = Versification.get_builtin(versification_type)
-            custom_versification_entry = get_entry(archive, "custom.vrs")
-            if custom_versification_entry is not None:
-                guid = settings_tree.getroot().findtext("Guid", "")
-                versification_name = f"{versification.name}-{guid}"
-                try:
-                    versification = _load_versification_from_entry(
-                        archive,
-                        custom_versification_entry,
-                        "custom.vrs",
-                        versification,
-                        versification_name,
-                        encoding="utf-8-sig",
-                    )
-                except UnicodeDecodeError:
-                    with archive.open(custom_versification_entry, "r") as file:
-                        vers_encoding = detect_encoding_from_stream(file)
-                    versification = _load_versification_from_entry(
-                        archive,
-                        custom_versification_entry,
-                        "custom.vrs",
-                        versification,
-                        versification_name,
-                        vers_encoding,
-                    )
-
-            stylesheet_name = settings_tree.getroot().findtext("StyleSheet", "usfm.sty")
-            stylesheet_entry = get_entry(archive, stylesheet_name)
-            if stylesheet_entry is None and stylesheet_name != "usfm_sb.sty":
-                stylesheet_entry = get_entry(archive, "usfm.sty")
-            custom_stylesheet_entry = get_entry(archive, "custom.sty")
-            with TemporaryFile() as stylesheet_temp_file, TemporaryFile() as custom_stylesheet_temp_file:
-                stylesheet_path = "usfm.sty"
-                if stylesheet_entry is not None:
-                    with archive.open(stylesheet_entry, "r") as file:
-                        stylesheet_temp_file.write(file.read())
-                    stylesheet_path = stylesheet_temp_file.name
-                stylesheet_temp_file.close()
-                custom_stylesheet_path: Optional[str] = None
-                if custom_stylesheet_entry is not None:
-                    with archive.open(custom_stylesheet_entry, "r") as file:
-                        custom_stylesheet_temp_file.write(file.read())
-                    custom_stylesheet_path = custom_stylesheet_temp_file.name
-                custom_stylesheet_temp_file.close()
-                stylesheet = UsfmStylesheet(stylesheet_path, custom_stylesheet_path)
-
-            prefix = ""
-            suffix = ".SFM"
-            naming_elem = settings_tree.getroot().find("Naming")
-            if naming_elem is not None:
-                pre_part = naming_elem.get("PrePart", "")
-                if pre_part != "":
-                    prefix = pre_part
-                post_part = naming_elem.get("PostPart", "")
-                if post_part != "":
-                    suffix = post_part
-
-            regex = re.compile(f"^{re.escape(prefix)}.*{re.escape(suffix)}$")
             texts: List[UsfmZipText] = []
             for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)):
                 texts.append(
-                    UsfmZipText(stylesheet, encoding, filename, sfm_entry.filename, versification, include_markers)
+                    UsfmZipText(
+                        settings.stylesheet,
+                        settings.encoding,
+                        filename,
+                        sfm_entry.filename,
+                        versification,
+                        include_markers,
+                    )
                 )
 
         super().__init__(versification, texts)
-
-
-def _load_versification_from_entry(
-    archive: ZipFile,
-    entry: ZipInfo,
-    filename: StrPath,
-    base_versification: Versification,
-    fallback_name: str,
-    encoding: str,
-) -> Versification:
-    with archive.open(entry, "r") as file:
-        stream = TextIOWrapper(file, encoding=encoding)
-        return Versification.parse(
-            stream, filename, Versification(fallback_name, filename, base_versification), fallback_name
-        )
diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py
@@ -0,0 +1,44 @@
+from dataclasses import dataclass
+
+from ..scripture.canon import book_id_to_number
+from ..scripture.verse_ref import Versification
+from .usfm_stylesheet import UsfmStylesheet
+
+
+@dataclass
+class ParatextProjectSettings:
+    """Plain record holding the settings of one Paratext project."""
+
+    name: str
+    full_name: str
+    encoding: str
+    versification: Versification
+    stylesheet: UsfmStylesheet
+    # The three file_name_* fields drive get_book_file_name():
+    # prefix + <book part selected by the form> + suffix.
+    file_name_prefix: str
+    file_name_form: str
+    file_name_suffix: str
+    biblical_terms_list_type: str
+    biblical_terms_project_name: str
+    biblical_terms_file_name: str
+
+    def get_book_file_name(self, book_id: str) -> str:
+        """Return the file name used for the book `book_id` under this project's naming scheme.
+
+        The book part is the id alone for form "MAT", the two-character book code alone
+        for forms "40" and "41", and the code followed by the id for any other form.
+        """
+        form = self.file_name_form
+        if form == "MAT":
+            middle = book_id
+        else:
+            middle = _get_book_file_name_digits(book_id)
+            if form not in ("40", "41"):
+                middle += book_id
+        return "".join((self.file_name_prefix, middle, self.file_name_suffix))
+
+
+def _get_book_file_name_digits(book_id: str) -> str:
+    """Return the book-number code that appears in a book's file name.
+
+    With n = book_id_to_number(book_id):
+    n < 10 -> "0n"; 10..39 -> n; 40..99 -> n + 1;
+    100..109 -> "A0".."A9"; 110..119 -> "B0".."B9"; 120 and above -> "C" + (n - 120).
+    """
+    number = book_id_to_number(book_id)
+    # Checked from the highest range downwards; each test only needs a lower bound.
+    if number >= 120:
+        return f"C{number - 120}"
+    if number >= 110:
+        return f"B{number - 110}"
+    if number >= 100:
+        return f"A{number - 100}"
+    if number >= 40:
+        return str(number + 1)
+    if number >= 10:
+        return str(number)
+    return f"0{number}"