Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update machine.py to reflect usfm changes in machine #103

Merged
merged 3 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions machine/corpora/corpora_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

import regex as re

from ..scripture import ENGLISH_VERSIFICATION
from ..scripture.canon import book_id_to_number
from ..scripture.verse_ref import VERSE_RANGE_SEPARATOR, VERSE_SEQUENCE_INDICATOR, Versification, VersificationType
from ..scripture.verse_ref import VERSE_RANGE_SEPARATOR, VERSE_SEQUENCE_INDICATOR, Versification

T = TypeVar("T")

Expand Down Expand Up @@ -107,7 +108,7 @@ def get_usx_versification(project_dir: Path, versification: Optional[Versificati
if versification is None and versification_filename.is_file():
versification_name = project_dir.name
versification = Versification.load(versification_filename, fallback_name=versification_name)
return Versification.get_builtin(VersificationType.ENGLISH) if versification is None else versification
return ENGLISH_VERSIFICATION if versification is None else versification


def merge_verse_ranges(verse1: str, verse2: str) -> str:
Expand Down
5 changes: 3 additions & 2 deletions machine/corpora/dbl_bundle_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from typing import List
from zipfile import ZipFile

from ..scripture.verse_ref import Versification, VersificationType
from ..scripture import ENGLISH_VERSIFICATION
from ..scripture.verse_ref import Versification
from ..utils.typeshed import StrPath
from .scripture_text_corpus import ScriptureTextCorpus
from .usx_zip_text import UsxZipText
Expand Down Expand Up @@ -32,7 +33,7 @@ def __init__(self, filename: StrPath) -> None:
TextIOWrapper(stream, encoding="utf-8-sig"), "versification.vrs", fallback_name=abbr
)
else:
versification = Versification.get_builtin(VersificationType.ENGLISH)
versification = ENGLISH_VERSIFICATION

texts: List[UsxZipText] = []
for content_elem in doc.getroot().findall("./publications/publication[@default='true']/structure/content"):
Expand Down
27 changes: 27 additions & 0 deletions machine/corpora/file_paratext_project_settings_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pathlib import Path
from typing import BinaryIO, Optional

from ..utils.typeshed import StrPath
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .usfm_stylesheet import UsfmStylesheet


class FileParatextProjectSettingsParser(ParatextProjectSettingsParserBase):
    """Settings parser backed by a Paratext project directory on disk.

    Every file name handed to the methods below is resolved relative to the
    project directory supplied at construction time.
    """

    def __init__(self, project_dir: StrPath) -> None:
        """Remember *project_dir* as the root for all later file lookups."""
        self._project_dir = Path(project_dir)

    def create_stylesheet(self, file_name: StrPath) -> UsfmStylesheet:
        """Build a ``UsfmStylesheet`` for *file_name*.

        The second constructor argument is the project's own copy of
        *file_name* when that file exists, and ``None`` otherwise.
        """
        # NOTE(review): the path checked here is <project_dir>/<file_name>,
        # not a separate "custom.sty" -- confirm this is the intended file.
        project_copy = self._project_dir / file_name
        if project_copy.is_file():
            return UsfmStylesheet(file_name, project_copy)
        return UsfmStylesheet(file_name, None)

    def exists(self, file_name: StrPath) -> bool:
        """Return True when *file_name* is a regular file in the project directory."""
        target = self._project_dir / file_name
        return target.is_file()

    def find(self, extension: str) -> Optional[Path]:
        """Return the first project file whose name ends with *extension*, or None."""
        for candidate in self._project_dir.glob(f"*{extension}"):
            return candidate
        return None

    def open(self, file_name: StrPath) -> BinaryIO:
        """Open *file_name* from the project directory for binary reading.

        The caller owns the returned handle and is responsible for closing it.
        """
        target = self._project_dir / file_name
        return open(target, "rb")
105 changes: 105 additions & 0 deletions machine/corpora/paratext_backup_terms_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import re
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional
from zipfile import ZipFile

from .corpora_utils import get_entry
from .dictionary_text_corpus import DictionaryTextCorpus
from .memory_text import MemoryText
from .text_row import TextRow
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser

_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"]


class ParatextBackupTermsCorpus(DictionaryTextCorpus):
    """Dictionary corpus of key-term renderings read from a Paratext project backup.

    The backup is a zip archive. Term renderings come from its
    ``TermRenderings.xml`` entry; term categories (used for filtering) come
    from the biblical terms list named in the project settings.
    """

    def __init__(self, filename: str, term_categories: List[str]) -> None:
        """Load the term renderings found in the backup at *filename*.

        Args:
            filename: Path of the Paratext project backup (zip archive).
            term_categories: Categories to keep. When empty, every term is
                kept; otherwise a term is kept only if its category is known
                and listed here.

        If the archive has no ``TermRenderings.xml`` entry the corpus is left
        empty.
        """
        # Initialise the base corpus up front so the instance is fully
        # constructed even when we bail out early below.
        super().__init__()

        rows: List[TextRow] = []
        with ZipFile(filename, "r") as archive:
            terms_file_entry = get_entry(archive, "TermRenderings.xml")
            if terms_file_entry is None:
                return
            settings_parser = ZipParatextProjectSettingsParser(archive)
            settings = settings_parser.parse()

            with archive.open(terms_file_entry) as key_terms_file:
                term_renderings_tree = ET.parse(key_terms_file)

            biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name)
            if settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
                # NOTE(review): this path is opened from the filesystem (relative
                # to the current working directory unless it is absolute), not
                # from the archive -- confirm where predefined lists live.
                with open(settings.biblical_terms_file_name, "rb") as key_terms_file:
                    biblical_terms_tree = ET.parse(key_terms_file)
                term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
            elif (
                settings.biblical_terms_list_type == "Project"
                and settings.biblical_terms_project_name == settings.name
                and biblical_terms_file_entry is not None
            ):
                with archive.open(biblical_terms_file_entry) as key_terms_file:
                    biblical_terms_tree = ET.parse(key_terms_file)
                term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
            else:
                term_id_to_category_dict = {}

        text_id = (
            f"{settings.biblical_terms_list_type}:"
            f"{settings.biblical_terms_project_name}:"
            f"{settings.biblical_terms_file_name}"
        )
        # ElementTree.iter() takes a plain tag name, not an XPath expression;
        # passing ".//TermRendering" would match no element at all.
        for e in term_renderings_tree.iter("TermRendering"):
            term_id = e.attrib["Id"]
            category = term_id_to_category_dict.get(term_id, "")
            # With a category filter in place, drop uncategorised terms and
            # terms whose category was not requested.
            if term_categories and (category == "" or category not in term_categories):
                continue
            term_id = term_id.replace("\n", "&#xA")
            rendering = e.findtext("Renderings", "")
            renderings = _get_renderings(rendering)
            rows.append(TextRow(text_id, term_id, segment=renderings))
        text = MemoryText(text_id, rows)
        self._add_text(text)


def _get_renderings(rendering: str) -> List[str]:
    """Split a raw term-rendering string into a list of clean, unique glosses.

    The string is normalised by removing an enclosing pair of square
    brackets, the wildcard characters ``?`` and ``*``, parenthesised and
    bracketed segments, and a trailing numeric annotation such as ``" 1"`` or
    ``" 2.3"``; ``/`` becomes a space. The result is split on ``||``.

    Args:
        rendering: Raw text of a term rendering.

    Returns:
        The non-empty, stripped glosses, de-duplicated in first-seen order.
    """
    # If entire term rendering is surrounded in square brackets, remove them
    match = re.match(r"^\[(.+?)\]$", rendering)
    if match:
        rendering = match.group(1)
    rendering = rendering.replace("?", "")
    rendering = rendering.replace("*", "")
    rendering = rendering.replace("/", " ")
    rendering = rendering.strip()
    rendering = _strip_parens(rendering)
    rendering = _strip_parens(rendering, left="[", right="]")
    # Remove the whole trailing number. re.findall() would only hand back the
    # capture group (e.g. ".3" or ""), never the full match, so substitute
    # directly instead of replacing findall() results.
    rendering = re.sub(r"\s+\d+(\.\d+)*$", "", rendering)
    stripped = (g.strip() for g in re.split(r"\|\|", rendering))
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    glosses = list(dict.fromkeys(g for g in stripped if g != ""))
    return glosses


def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
parens = 0
end = -1
for i in range(len(term_string) - 1, -1, -1):
c = term_string[i]
if c == right:
if parens == 0:
end = i + 1
parens += 1
elif c == left:
if parens > 0:
parens -= 1
if parens == 0:
term_string = term_string[:i] + term_string[end:]
return term_string


def _get_category_per_id(biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]:
term_id_to_category_dict = {}
for e in biblical_terms_tree.iter(".//Term"):
category_element = e.find("Category")
category = category_element.text if category_element is not None and category_element.text is not None else ""
term_id_to_category_dict[e.attrib["Id"]] = category
return term_id_to_category_dict
117 changes: 15 additions & 102 deletions machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,121 +1,34 @@
import xml.etree.ElementTree as etree
from io import TextIOWrapper
from tempfile import TemporaryFile
from typing import List, Optional
from zipfile import ZipFile, ZipInfo
from typing import List
from zipfile import ZipFile

import regex as re

from ..scripture.verse_ref import Versification
from ..utils.file_utils import detect_encoding_from_stream
from ..utils.string_utils import parse_integer
from ..utils.typeshed import StrPath
from .corpora_utils import find_entry, get_encoding, get_entry
from .scripture_text_corpus import ScriptureTextCorpus
from .usfm_stylesheet import UsfmStylesheet
from .usfm_zip_text import UsfmZipText
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser


class ParatextBackupTextCorpus(ScriptureTextCorpus):
def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
with ZipFile(filename, "r") as archive:
settings_entry = get_entry(archive, "Settings.xml")
if settings_entry is None:
settings_entry = find_entry(archive, lambda zi: zi.filename.endswith(".ssf"))
if settings_entry is None:
raise ValueError("The project backup does not contain a settings file.")
parser = ZipParatextProjectSettingsParser(archive)
settings = parser.parse()

with archive.open(settings_entry, "r") as file:
settings_tree = etree.parse(file)
versification = settings.versification
regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")

encoding_str = settings_tree.getroot().findtext("Encoding", "65001")
code_page = parse_integer(encoding_str)
if code_page is None:
raise NotImplementedError(
f"The project uses a legacy encoding that requires TECKit, map file: {encoding_str}."
)
encoding = get_encoding(code_page)
if encoding is None:
raise RuntimeError(f"Code page {code_page} not supported.")

versification_type = int(settings_tree.getroot().findtext("Versification", "4"))
versification = Versification.get_builtin(versification_type)
custom_versification_entry = get_entry(archive, "custom.vrs")
if custom_versification_entry is not None:
guid = settings_tree.getroot().findtext("Guid", "")
versification_name = f"{versification.name}-{guid}"
try:
versification = _load_versification_from_entry(
archive,
custom_versification_entry,
"custom.vrs",
versification,
versification_name,
encoding="utf-8-sig",
)
except UnicodeDecodeError:
with archive.open(custom_versification_entry, "r") as file:
vers_encoding = detect_encoding_from_stream(file)
versification = _load_versification_from_entry(
archive,
custom_versification_entry,
"custom.vrs",
versification,
versification_name,
vers_encoding,
)

stylesheet_name = settings_tree.getroot().findtext("StyleSheet", "usfm.sty")
stylesheet_entry = get_entry(archive, stylesheet_name)
if stylesheet_entry is None and stylesheet_name != "usfm_sb.sty":
stylesheet_entry = get_entry(archive, "usfm.sty")
custom_stylesheet_entry = get_entry(archive, "custom.sty")
with TemporaryFile() as stylesheet_temp_file, TemporaryFile() as custom_stylesheet_temp_file:
stylesheet_path = "usfm.sty"
if stylesheet_entry is not None:
with archive.open(stylesheet_entry, "r") as file:
stylesheet_temp_file.write(file.read())
stylesheet_path = stylesheet_temp_file.name
stylesheet_temp_file.close()
custom_stylesheet_path: Optional[str] = None
if custom_stylesheet_entry is not None:
with archive.open(custom_stylesheet_entry, "r") as file:
custom_stylesheet_temp_file.write(file.read())
custom_stylesheet_path = custom_stylesheet_temp_file.name
custom_stylesheet_temp_file.close()
stylesheet = UsfmStylesheet(stylesheet_path, custom_stylesheet_path)

prefix = ""
suffix = ".SFM"
naming_elem = settings_tree.getroot().find("Naming")
if naming_elem is not None:
pre_part = naming_elem.get("PrePart", "")
if pre_part != "":
prefix = pre_part
post_part = naming_elem.get("PostPart", "")
if post_part != "":
suffix = post_part

regex = re.compile(f"^{re.escape(prefix)}.*{re.escape(suffix)}$")
texts: List[UsfmZipText] = []
for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)):
texts.append(
UsfmZipText(stylesheet, encoding, filename, sfm_entry.filename, versification, include_markers)
UsfmZipText(
settings.stylesheet,
settings.encoding,
filename,
sfm_entry.filename,
versification,
include_markers,
)
)

super().__init__(versification, texts)


def _load_versification_from_entry(
archive: ZipFile,
entry: ZipInfo,
filename: StrPath,
base_versification: Versification,
fallback_name: str,
encoding: str,
) -> Versification:
with archive.open(entry, "r") as file:
stream = TextIOWrapper(file, encoding=encoding)
return Versification.parse(
stream, filename, Versification(fallback_name, filename, base_versification), fallback_name
)
44 changes: 44 additions & 0 deletions machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from dataclasses import dataclass

from ..scripture.canon import book_id_to_number
from ..scripture.verse_ref import Versification
from .usfm_stylesheet import UsfmStylesheet


@dataclass
class ParatextProjectSettings:
    """Settings of a Paratext project, as plain data."""

    name: str
    full_name: str
    encoding: str
    versification: Versification
    stylesheet: UsfmStylesheet
    file_name_prefix: str
    file_name_form: str
    file_name_suffix: str
    biblical_terms_list_type: str
    biblical_terms_project_name: str
    biblical_terms_file_name: str

    def get_book_file_name(self, book_id: str) -> str:
        """Return the file name used for the book *book_id* in this project.

        The middle part of the name depends on ``file_name_form``:
        ``"MAT"`` uses the book id alone, ``"40"`` or ``"41"`` uses the
        book's digit code alone, and any other form uses the digit code
        followed by the book id. The configured prefix and suffix are then
        wrapped around it.
        """
        form = self.file_name_form
        if form == "MAT":
            middle = book_id
        else:
            middle = _get_book_file_name_digits(book_id)
            if form not in ("40", "41"):
                middle = middle + book_id
        return "".join((self.file_name_prefix, middle, self.file_name_suffix))


def _get_book_file_name_digits(book_id: str) -> str:
    """Return the two-character code used in file names for *book_id*.

    The code is derived from the number ``book_id_to_number`` returns:
    below 10 it is zero-padded, 10-39 is used as is, 40-99 is shifted up by
    one, and 100 and above becomes a letter (``A``, ``B`` or ``C``) followed
    by the offset within that block of ten.
    """
    number = book_id_to_number(book_id)
    # Work from the highest range downwards; each guard returns immediately.
    if number >= 120:
        return f"C{number - 120}"
    if number >= 110:
        return f"B{number - 110}"
    if number >= 100:
        return f"A{number - 100}"
    if number >= 40:
        return str(number + 1)
    return f"0{number}" if number < 10 else str(number)
Loading
Loading