Skip to content

Commit

Permalink
Port USFM code from Machine up to commit a9058ce (#111)
Browse files Browse the repository at this point in the history
* port commit 436a67d that moved logic to parallel text corpus

* port commit 436a67d that adds test cases for scripture text corpus

* port commit 29c9799 that adds support for mixed source corpora

* port commit fa65835, default to major biblical terms

* port commit a9058ce, support for non-verse text segments in Scripture corpora

* handle overloading, update __init__.py, use is_scripture, move files to corpora folder, use top level constant for empty ScriptureRef, change __eq__ back to exact_equals, keep test files consistent
  • Loading branch information
mshannon-sil authored Aug 15, 2024
1 parent 2f7f44f commit 953f203
Show file tree
Hide file tree
Showing 35 changed files with 1,397 additions and 461 deletions.
18 changes: 17 additions & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,15 @@
from .parallel_text_row import ParallelTextRow
from .paratext_backup_text_corpus import ParatextBackupTextCorpus
from .paratext_text_corpus import ParatextTextCorpus
from .scripture_text_corpus import ScriptureTextCorpus, create_versification_ref_corpus, extract_scripture_corpus
from .scripture_element import ScriptureElement
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
from .scripture_text_corpus import (
ScriptureTextCorpus,
create_versification_ref_corpus,
extract_scripture_corpus,
is_scripture,
)
from .standard_parallel_text_corpus import StandardParallelTextCorpus
from .text import Text
from .text_corpus import TextCorpus
Expand All @@ -41,6 +49,7 @@
from .usfm_parser_state import UsfmElementType, UsfmParserElement, UsfmParserState
from .usfm_stylesheet import UsfmStylesheet
from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType
from .usfm_text_updater import UsfmTextUpdater
from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
from .usfm_tokenizer import RtlReferenceOrder, UsfmTokenizer
from .usx_file_alignment_collection import UsxFileAlignmentCollection
Expand All @@ -60,9 +69,11 @@
"DblBundleTextCorpus",
"DictionaryAlignmentCorpus",
"DictionaryTextCorpus",
"EMPTY_SCRIPTURE_REF",
"escape_spaces",
"extract_scripture_corpus",
"flatten",
"is_scripture",
"lowercase",
"MemoryAlignmentCollection",
"MemoryText",
Expand All @@ -78,7 +89,11 @@
"ParatextTextCorpus",
"parse_usfm",
"RtlReferenceOrder",
"ScriptureElement",
"ScriptureRef",
"ScriptureRefUsfmParserHandler",
"ScriptureTextCorpus",
"ScriptureTextType",
"StandardParallelTextCorpus",
"Text",
"TextCorpus",
Expand All @@ -104,6 +119,7 @@
"UsfmTag",
"UsfmTextProperties",
"UsfmTextType",
"UsfmTextUpdater",
"UsfmToken",
"UsfmTokenizer",
"UsfmTokenType",
Expand Down
10 changes: 10 additions & 0 deletions machine/corpora/dictionary_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Iterable, Optional, overload

from ..scripture.verse_ref import Versification
from .text import Text
from .text_corpus import TextCorpus

Expand All @@ -21,6 +22,7 @@ def __init__(self, *args, **kwargs) -> None:
texts = args[0]
self._texts = {t.id: t for t in texts}
self._is_tokenized = False
self._versification = None

@property
def texts(self) -> Iterable[Text]:
Expand All @@ -34,6 +36,14 @@ def is_tokenized(self) -> bool:
def is_tokenized(self, value: bool) -> None:
self._is_tokenized = value

@property
def versification(self) -> Optional[Versification]:
    """Return the versification stored on this corpus, or ``None`` if none has been set."""
    return self._versification

@versification.setter
def versification(self, value: Versification) -> None:
    """Store ``value`` as this corpus's versification."""
    self._versification = value

def __getitem__(self, id: str) -> Optional[Text]:
    """Return the text registered under ``id``, or ``None`` when no such text exists."""
    # dict.get is used so that an unknown id yields None instead of raising KeyError.
    return self._texts.get(id)

Expand Down
5 changes: 5 additions & 0 deletions machine/corpora/flatten.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from itertools import chain
from typing import Generator, Iterable, List, Optional, cast, overload

from ..scripture.verse_ref import Versification
from .alignment_collection import AlignmentCollection
from .alignment_corpus import AlignmentCorpus
from .alignment_row import AlignmentRow
Expand Down Expand Up @@ -54,6 +55,10 @@ def texts(self) -> Iterable[Text]:
def is_tokenized(self) -> bool:
return all(c.is_tokenized for c in self._corpora)

@property
def versification(self) -> Optional[Versification]:
    """Return the versification of the first wrapped corpus, or ``None`` when there are no corpora."""
    if len(self._corpora) == 0:
        return None
    first_corpus = self._corpora[0]
    return first_corpus.versification

def count(self, include_empty: bool = True) -> int:
    """Return the sum of ``count(include_empty)`` over every wrapped corpus (0 when there are none)."""
    total = 0
    for corpus in self._corpora:
        total += corpus.count(include_empty)
    return total

Expand Down
2 changes: 2 additions & 0 deletions machine/corpora/parallel_text_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def __init__(
source_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
target_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
) -> None:
if not text_id:
raise ValueError("A text_id must be set.")
if len(source_refs) == 0 and len(target_refs) == 0:
raise ValueError("Either a source or target ref must be set.")
self._text_id = text_id
Expand Down
3 changes: 2 additions & 1 deletion machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class ParatextBackupTextCorpus(ScriptureTextCorpus):
def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
def __init__(self, filename: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
with ZipFile(filename, "r") as archive:
parser = ZipParatextProjectSettingsParser(archive)
settings = parser.parse()
Expand All @@ -28,6 +28,7 @@ def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
sfm_entry.filename,
versification,
include_markers,
include_all_text,
)
)

Expand Down
12 changes: 10 additions & 2 deletions machine/corpora/paratext_project_settings_parser_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,16 @@ def parse(self) -> ParatextProjectSettings:
post_part = naming_elem.get("PostPart")
if post_part:
suffix = post_part
biblical_terms = settings_tree.getroot().findtext("BiblicalTermsListSetting", "")
parts = biblical_terms.split(":", 2)
# Read the biblical terms list setting; it is expected to have three ":"-separated parts
# (for example "Major::BiblicalTerms.xml").
biblical_terms_list_setting = settings_tree.getroot().findtext("BiblicalTermsListSetting", "")
if not biblical_terms_list_setting:
    # findtext() is called with "" as its default, so a missing (or empty) element yields ""
    # and never None. Testing for emptiness is what makes this default actually take effect.
    # Default to Major::BiblicalTerms.xml to mirror Paratext behavior
    biblical_terms_list_setting = "Major::BiblicalTerms.xml"
parts = biblical_terms_list_setting.split(":", 2)
if len(parts) != 3:
    raise ValueError(
        f"The BiblicalTermsListSetting element in Settings.xml in project {full_name}"
        f" is not in the expected format (i.e., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}."
    )

return ParatextProjectSettings(
name, full_name, encoding, versification, stylesheet, prefix, form, suffix, parts[0], parts[1], parts[2]
Expand Down
11 changes: 9 additions & 2 deletions machine/corpora/paratext_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class ParatextTextCorpus(ScriptureTextCorpus):
def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
def __init__(self, project_dir: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
parser = FileParatextProjectSettingsParser(project_dir)
settings = parser.parse()

Expand All @@ -17,7 +17,14 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
texts.append(
UsfmFileText(settings.stylesheet, settings.encoding, sfm_filename, versification, include_markers)
UsfmFileText(
settings.stylesheet,
settings.encoding,
sfm_filename,
versification,
include_markers,
include_all_text,
)
)

super().__init__(versification, texts)
54 changes: 54 additions & 0 deletions machine/corpora/scripture_element.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from __future__ import annotations

from functools import total_ordering
from typing import Optional

from ..utils.comparable import Comparable


@total_ordering
class ScriptureElement(Comparable):
    """A named element paired with an integer position.

    Elements order by position first and then by name; ``total_ordering`` derives the remaining
    rich comparisons from ``__eq__`` and ``__lt__``.
    """

    def __init__(self, position: int, name: str) -> None:
        """Store the element's ``position`` and ``name``."""
        self._name = name
        self._position = position

    @property
    def position(self) -> int:
        """The integer position given at construction."""
        return self._position

    @property
    def name(self) -> str:
        """The name given at construction."""
        return self._name

    def compare_to(self, other: object, strict: Optional[bool] = True) -> int:
        """Three-way compare this element with ``other``.

        Args:
            other: The element to compare against.
            strict: When truthy, positions are compared before names; when falsy, only names
                are compared.

        Returns:
            A negative number, zero, or a positive number when this element sorts before,
            equal to, or after ``other``.

        Raises:
            TypeError: If ``other`` is not a ``ScriptureElement``.
        """
        if not isinstance(other, ScriptureElement):
            raise TypeError("other is not a ScriptureElement object.")
        if other is self:
            return 0

        position_delta = self.position - other.position if strict else 0
        if position_delta != 0:
            return position_delta

        # Positions tie (or are ignored): fall back to ordering by name.
        if self.name < other.name:
            return -1
        if self.name > other.name:
            return 1
        return 0

    def __eq__(self, other: ScriptureElement) -> bool:
        """Elements are equal when both position and name match."""
        if isinstance(other, ScriptureElement):
            return (self.position, self.name) == (other.position, other.name)
        return NotImplemented

    def __lt__(self, other: ScriptureElement) -> bool:
        """Order by the strict ``compare_to`` result."""
        if isinstance(other, ScriptureElement):
            return self.compare_to(other) < 0
        return NotImplemented

    def __hash__(self) -> int:
        """Hash on the same (position, name) pair that ``__eq__`` compares."""
        return hash((self.position, self.name))

    def __repr__(self):
        """Render as ``name`` when the position is 0, otherwise as ``position:name``."""
        return self.name if self.position == 0 else f"{self.position}:{self.name}"
128 changes: 128 additions & 0 deletions machine/corpora/scripture_ref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from __future__ import annotations

from functools import total_ordering
from typing import List, Optional

from ..scripture.constants import ENGLISH_VERSIFICATION
from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
from ..utils.comparable import Comparable
from .scripture_element import ScriptureElement


@total_ordering
class ScriptureRef(Comparable):
    """A verse reference combined with an optional path of ``ScriptureElement`` objects.

    The textual form is ``<verse ref>[/<element>...]``, where each element is written either as
    ``name`` (position 0) or as ``position:name``. ``total_ordering`` derives the remaining rich
    comparisons from ``__eq__`` and ``__lt__``.
    """

    def __init__(self, ref: Optional[VerseRef] = None, path: Optional[List[ScriptureElement]] = None) -> None:
        """Create a reference.

        Args:
            ref: The verse reference; a default-constructed ``VerseRef`` is used when omitted.
            path: The element path; an empty list is used when omitted.
        """
        self._verse_ref: VerseRef = ref if ref is not None else VerseRef()
        self._path: List[ScriptureElement] = path if path is not None else []

    # NOTE(review): nothing in this class reads or writes this attribute; it is kept only so that
    # any existing access to ``ScriptureRef._empty`` keeps working.
    _empty: Optional[ScriptureRef] = None

    @classmethod
    def parse(cls, selection: str, versification: Optional[Versification] = None) -> ScriptureRef:
        """Parse ``selection`` (``<verse ref>[/<element>...]``) into a ``ScriptureRef``.

        Args:
            selection: The text to parse. Everything before the first ``/`` is handed to
                ``VerseRef.from_string``; each later ``/``-separated part becomes one path element.
            versification: The versification for the verse reference; ``ENGLISH_VERSIFICATION`` is
                used when omitted.

        Returns:
            The parsed reference.

        Raises:
            ValueError: If an element's position (the text before ``:``) is not an integer.
        """
        if versification is None:
            versification = ENGLISH_VERSIFICATION

        vref, *element_parts = selection.split("/")
        path: List[ScriptureElement] = []
        for part in element_parts:
            elem: List[str] = part.split(":")
            if len(elem) == 1:
                # No explicit position: the element is recorded at position 0.
                path.append(ScriptureElement(0, elem[0]))
            else:
                path.append(ScriptureElement(int(elem[0]), elem[1]))

        return cls(VerseRef.from_string(vref, versification), path)

    @property
    def verse_ref(self) -> VerseRef:
        """The underlying verse reference."""
        return self._verse_ref

    @property
    def path(self) -> List[ScriptureElement]:
        """The element path (empty when the reference has no elements)."""
        return self._path

    @property
    def book_num(self) -> int:
        """``book_num`` of the underlying verse reference."""
        return self.verse_ref.book_num

    @property
    def chapter_num(self) -> int:
        """``chapter_num`` of the underlying verse reference."""
        return self.verse_ref.chapter_num

    @property
    def verse_num(self) -> int:
        """``verse_num`` of the underlying verse reference."""
        return self.verse_ref.verse_num

    @property
    def book(self) -> str:
        """``book`` of the underlying verse reference."""
        return self.verse_ref.book

    @property
    def chapter(self) -> str:
        """``chapter`` of the underlying verse reference."""
        return self.verse_ref.chapter

    @property
    def verse(self) -> str:
        """``verse`` of the underlying verse reference."""
        return self.verse_ref.verse

    @property
    def versification(self) -> Versification:
        """``versification`` of the underlying verse reference."""
        return self.verse_ref.versification

    @property
    def is_empty(self) -> bool:
        """Whether the underlying verse reference reports ``is_default``."""
        return self.verse_ref.is_default

    @property
    def is_verse(self) -> bool:
        """Whether this reference has a non-zero verse number and an empty element path."""
        # Read the verse number from this instance's verse reference. Accessing ``verse_num`` on the
        # VerseRef class itself yields the property object, which never compares equal to 0.
        return self.verse_ref.verse_num != 0 and len(self.path) == 0

    def change_versification(self, versification: Versification) -> ScriptureRef:
        """Return a new reference whose verse reference is a copy converted to ``versification``.

        The returned reference is built with the same path list object as this one.
        """
        vr: VerseRef = self.verse_ref.copy()
        vr.change_versification(versification)
        return ScriptureRef(vr, self.path)

    def overlaps(self, other: ScriptureRef) -> bool:
        """Whether the verse ranges overlap and the element paths are equal."""
        if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
            return False
        return self.path == other.path

    def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True) -> int:
        """Three-way compare this reference with ``other``.

        Args:
            other: The reference to compare against.
            compare_segments: Forwarded to ``VerseRef.compare_to``.
            strict: Forwarded to ``ScriptureElement.compare_to`` for each pair of path elements.

        Returns:
            A negative number, zero, or a positive number when this reference sorts before,
            equal to, or after ``other``.

        Raises:
            TypeError: If ``other`` is not a ``ScriptureRef``.
        """
        if not isinstance(other, ScriptureRef):
            raise TypeError("other is not a ScriptureRef object.")
        if self is other:
            return 0

        res = self.verse_ref.compare_to(other.verse_ref, compare_segments=compare_segments)
        if res != 0:
            return res

        # Compare the shared prefix of the two paths element by element.
        for se1, se2 in zip(self.path, other.path):
            res = se1.compare_to(se2, strict=strict)
            if res != 0:
                return res

        # One path is a prefix of the other (or they match): the shorter path sorts first.
        return len(self.path) - len(other.path)

    def __eq__(self, other: object) -> bool:
        """References are equal when both the verse reference and the path are equal."""
        if not isinstance(other, ScriptureRef):
            return NotImplemented
        return self.verse_ref == other.verse_ref and self.path == other.path

    def __lt__(self, other: object) -> bool:
        """Order by the default ``compare_to`` result."""
        if not isinstance(other, ScriptureRef):
            return NotImplemented
        return self.compare_to(other) < 0

    def __hash__(self) -> int:
        """Hash on the verse reference and the path, matching ``__eq__``."""
        return hash((self.verse_ref, tuple(self.path)))

    def __repr__(self) -> str:
        """Render as ``<verse ref>`` followed by ``/<element>`` for each path element."""
        # Join only the parts that exist so an empty path produces no trailing "/"; a trailing "/"
        # would be read back by parse() as an extra element with an empty name.
        return "/".join([str(self.verse_ref), *(str(se) for se in self.path)])


EMPTY_SCRIPTURE_REF = ScriptureRef()
Loading

0 comments on commit 953f203

Please sign in to comment.