Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port USFM code from Machine up to commit a9058ce #111

Merged
merged 6 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,15 @@
from .parallel_text_row import ParallelTextRow
from .paratext_backup_text_corpus import ParatextBackupTextCorpus
from .paratext_text_corpus import ParatextTextCorpus
from .scripture_text_corpus import ScriptureTextCorpus, create_versification_ref_corpus, extract_scripture_corpus
from .scripture_element import ScriptureElement
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
from .scripture_text_corpus import (
ScriptureTextCorpus,
create_versification_ref_corpus,
extract_scripture_corpus,
is_scripture,
)
from .standard_parallel_text_corpus import StandardParallelTextCorpus
from .text import Text
from .text_corpus import TextCorpus
Expand All @@ -41,6 +49,7 @@
from .usfm_parser_state import UsfmElementType, UsfmParserElement, UsfmParserState
from .usfm_stylesheet import UsfmStylesheet
from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType
from .usfm_text_updater import UsfmTextUpdater
from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
from .usfm_tokenizer import RtlReferenceOrder, UsfmTokenizer
from .usx_file_alignment_collection import UsxFileAlignmentCollection
Expand All @@ -60,9 +69,11 @@
"DblBundleTextCorpus",
"DictionaryAlignmentCorpus",
"DictionaryTextCorpus",
"EMPTY_SCRIPTURE_REF",
"escape_spaces",
"extract_scripture_corpus",
"flatten",
"is_scripture",
"lowercase",
"MemoryAlignmentCollection",
"MemoryText",
Expand All @@ -78,7 +89,11 @@
"ParatextTextCorpus",
"parse_usfm",
"RtlReferenceOrder",
"ScriptureElement",
"ScriptureRef",
"ScriptureRefUsfmParserHandler",
"ScriptureTextCorpus",
"ScriptureTextType",
"StandardParallelTextCorpus",
"Text",
"TextCorpus",
Expand All @@ -104,6 +119,7 @@
"UsfmTag",
"UsfmTextProperties",
"UsfmTextType",
"UsfmTextUpdater",
"UsfmToken",
"UsfmTokenizer",
"UsfmTokenType",
Expand Down
10 changes: 10 additions & 0 deletions machine/corpora/dictionary_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Iterable, Optional, overload

from ..scripture.verse_ref import Versification
from .text import Text
from .text_corpus import TextCorpus

Expand All @@ -21,6 +22,7 @@ def __init__(self, *args, **kwargs) -> None:
texts = args[0]
self._texts = {t.id: t for t in texts}
self._is_tokenized = False
self._versification = None

@property
def texts(self) -> Iterable[Text]:
Expand All @@ -34,6 +36,14 @@ def is_tokenized(self) -> bool:
def is_tokenized(self, value: bool) -> None:
    """Store the tokenized flag for this corpus.

    Args:
        value: The new value of the flag.
    """
    self._is_tokenized = value

@property
def versification(self) -> Optional[Versification]:
    """Return the versification stored on this corpus, or None if none has been assigned."""
    return self._versification

@versification.setter
def versification(self, value: Versification) -> None:
    """Assign the versification stored on this corpus.

    Args:
        value: The versification to store.
    """
    self._versification = value

def __getitem__(self, id: str) -> Optional[Text]:
    """Look up a text by its id.

    Args:
        id: The id of the text to fetch.

    Returns:
        The stored text, or None when no text with that id exists.
    """
    texts_by_id = self._texts
    return texts_by_id.get(id)

Expand Down
5 changes: 5 additions & 0 deletions machine/corpora/flatten.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from itertools import chain
from typing import Generator, Iterable, List, Optional, cast, overload

from ..scripture.verse_ref import Versification
from .alignment_collection import AlignmentCollection
from .alignment_corpus import AlignmentCorpus
from .alignment_row import AlignmentRow
Expand Down Expand Up @@ -54,6 +55,10 @@ def texts(self) -> Iterable[Text]:
def is_tokenized(self) -> bool:
    """Report whether every wrapped corpus is tokenized.

    Returns:
        False as soon as one corpus reports it is not tokenized; True otherwise
        (including when there are no corpora).
    """
    for corpus in self._corpora:
        if not corpus.is_tokenized:
            return False
    return True

@property
def versification(self) -> Optional[Versification]:
    """Return the versification of the first wrapped corpus.

    Returns:
        The first corpus's versification, or None when there are no corpora.
    """
    if len(self._corpora) == 0:
        return None
    first_corpus = self._corpora[0]
    return first_corpus.versification

def count(self, include_empty: bool = True) -> int:
    """Add up the counts reported by every wrapped corpus.

    Args:
        include_empty: Forwarded unchanged to each corpus's ``count`` call.

    Returns:
        The sum of the per-corpus counts (0 when there are no corpora).
    """
    total = 0
    for corpus in self._corpora:
        total += corpus.count(include_empty)
    return total

Expand Down
2 changes: 2 additions & 0 deletions machine/corpora/parallel_text_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def __init__(
source_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
target_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
) -> None:
if not text_id:
raise ValueError("A text_id must be set.")
if len(source_refs) == 0 and len(target_refs) == 0:
raise ValueError("Either a source or target ref must be set.")
self._text_id = text_id
Expand Down
3 changes: 2 additions & 1 deletion machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class ParatextBackupTextCorpus(ScriptureTextCorpus):
def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
def __init__(self, filename: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
with ZipFile(filename, "r") as archive:
parser = ZipParatextProjectSettingsParser(archive)
settings = parser.parse()
Expand All @@ -28,6 +28,7 @@ def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
sfm_entry.filename,
versification,
include_markers,
include_all_text,
)
)

Expand Down
12 changes: 10 additions & 2 deletions machine/corpora/paratext_project_settings_parser_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,16 @@ def parse(self) -> ParatextProjectSettings:
post_part = naming_elem.get("PostPart")
if post_part:
suffix = post_part
biblical_terms = settings_tree.getroot().findtext("BiblicalTermsListSetting", "")
parts = biblical_terms.split(":", 2)
# Call findtext() without a default so a missing element yields None. With a
# default of "" the fallback below could never trigger, and a missing element
# ended up raising the format error instead of using the default list.
biblical_terms_list_setting = settings_tree.getroot().findtext("BiblicalTermsListSetting")
if not biblical_terms_list_setting:
    # Default to Major::BiblicalTerms.xml to mirror Paratext behavior
    # (applies to both a missing element and an element with empty text).
    biblical_terms_list_setting = "Major::BiblicalTerms.xml"
# The setting must consist of three colon-separated parts; maxsplit=2 keeps any
# further colons inside the third part.
parts = biblical_terms_list_setting.split(":", 2)
if len(parts) != 3:
    raise ValueError(
        f"The BiblicalTermsListSetting element in Settings.xml in project {full_name}"
        f" is not in the expected format (i.e., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}."
    )

return ParatextProjectSettings(
name, full_name, encoding, versification, stylesheet, prefix, form, suffix, parts[0], parts[1], parts[2]
Expand Down
11 changes: 9 additions & 2 deletions machine/corpora/paratext_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class ParatextTextCorpus(ScriptureTextCorpus):
def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
def __init__(self, project_dir: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
parser = FileParatextProjectSettingsParser(project_dir)
settings = parser.parse()

Expand All @@ -17,7 +17,14 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
texts.append(
UsfmFileText(settings.stylesheet, settings.encoding, sfm_filename, versification, include_markers)
UsfmFileText(
settings.stylesheet,
settings.encoding,
sfm_filename,
versification,
include_markers,
include_all_text,
)
)

super().__init__(versification, texts)
54 changes: 54 additions & 0 deletions machine/corpora/scripture_element.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from __future__ import annotations

from functools import total_ordering
from typing import Optional

from ..utils.comparable import Comparable


@total_ordering
class ScriptureElement(Comparable):
    """A named element identified by an integer position and a name.

    Instances are hashable and ordered. ``total_ordering`` derives the remaining
    rich comparisons from ``__eq__`` and ``__lt__``; ``__lt__`` delegates to
    ``compare_to`` in strict mode (position first, then name).
    """

    def __init__(self, position: int, name: str) -> None:
        """Create an element.

        Args:
            position: Integer position of the element.
            name: Name of the element.
        """
        self._position = position
        self._name = name

    @property
    def position(self) -> int:
        """The integer position given at construction."""
        return self._position

    @property
    def name(self) -> str:
        """The name given at construction."""
        return self._name

    def compare_to(self, other: object, strict: Optional[bool] = True) -> int:
        """Compare this element with another one.

        Args:
            other: The element to compare against.
            strict: When truthy, positions are compared first and names only break
                ties. When falsy (including None), only the names are compared.

        Returns:
            A negative number, zero, or a positive number when this element sorts
            before, equal to, or after ``other``.

        Raises:
            TypeError: If ``other`` is not a ScriptureElement.
        """
        if not isinstance(other, ScriptureElement):
            raise TypeError("other is not a ScriptureElement object.")
        if self is other:
            return 0

        if strict:
            position_diff = self.position - other.position
            if position_diff != 0:
                return position_diff

        # Three-way string comparison: yields -1, 0 or 1.
        return (self.name > other.name) - (self.name < other.name)

    def __eq__(self, other: object) -> bool:
        """Return True when both position and name match; NotImplemented for other types."""
        if not isinstance(other, ScriptureElement):
            return NotImplemented

        return self.position == other.position and self.name == other.name

    def __lt__(self, other: object) -> bool:
        """Return True when this element sorts before ``other`` in strict mode."""
        if not isinstance(other, ScriptureElement):
            return NotImplemented

        return self.compare_to(other) < 0

    def __hash__(self) -> int:
        """Hash on the same (position, name) pair that ``__eq__`` compares."""
        return hash((self.position, self.name))

    def __repr__(self) -> str:
        """Return ``name`` when the position is 0, otherwise ``position:name``."""
        if self.position == 0:
            return self.name
        return f"{self.position}:{self.name}"
128 changes: 128 additions & 0 deletions machine/corpora/scripture_ref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from __future__ import annotations

from functools import total_ordering
from typing import List, Optional

from ..scripture.constants import ENGLISH_VERSIFICATION
from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
from ..utils.comparable import Comparable
from .scripture_element import ScriptureElement


@total_ordering
class ScriptureRef(Comparable):
    """A verse reference combined with a path of ScriptureElement objects.

    The textual form handled by ``parse`` and produced by ``__repr__`` is the verse
    reference followed by one ``/``-prefixed entry per path element. Instances are
    hashable and ordered; ``total_ordering`` derives the remaining rich comparisons
    from ``__eq__`` and ``__lt__``, and ``__lt__`` delegates to ``compare_to``.
    """

    def __init__(self, ref: Optional[VerseRef] = None, path: Optional[List[ScriptureElement]] = None) -> None:
        """Create a reference.

        Args:
            ref: The verse reference; a default-constructed VerseRef is used when omitted.
            path: The element path; an empty list is used when omitted. The list is
                stored as given, not copied.
        """
        self._verse_ref: VerseRef = ref if ref is not None else VerseRef()
        self._path: List[ScriptureElement] = path if path is not None else []

    # NOTE(review): not referenced anywhere in the visible code.
    _empty: Optional[ScriptureRef] = None

    @classmethod
    def parse(cls, selection: str, versification: Optional[Versification] = None) -> ScriptureRef:
        """Build a reference from its textual form.

        Args:
            selection: Text of the form ``<verse ref>[/<element>...]`` where each
                element is either ``<name>`` (position 0) or ``<position>:<name>``.
            versification: Versification passed to ``VerseRef.from_string``;
                ENGLISH_VERSIFICATION is used when omitted.

        Returns:
            The parsed reference.

        Raises:
            ValueError: If an element's position is not an integer (raised by ``int``).
        """
        if versification is None:
            versification = ENGLISH_VERSIFICATION

        vref, *path_parts = selection.split("/")
        path: List[ScriptureElement] = []
        for part in path_parts:
            elem: List[str] = part.split(":")
            if len(elem) == 1:
                path.append(ScriptureElement(0, elem[0]))
            else:
                path.append(ScriptureElement(int(elem[0]), elem[1]))

        return cls(VerseRef.from_string(vref, versification), path)

    @property
    def verse_ref(self) -> VerseRef:
        """The underlying verse reference."""
        return self._verse_ref

    @property
    def path(self) -> List[ScriptureElement]:
        """The element path (the stored list itself, not a copy)."""
        return self._path

    @property
    def book_num(self) -> int:
        """``book_num`` of the underlying verse reference."""
        return self.verse_ref.book_num

    @property
    def chapter_num(self) -> int:
        """``chapter_num`` of the underlying verse reference."""
        return self.verse_ref.chapter_num

    @property
    def verse_num(self) -> int:
        """``verse_num`` of the underlying verse reference."""
        return self.verse_ref.verse_num

    @property
    def book(self) -> str:
        """``book`` of the underlying verse reference."""
        return self.verse_ref.book

    @property
    def chapter(self) -> str:
        """``chapter`` of the underlying verse reference."""
        return self.verse_ref.chapter

    @property
    def verse(self) -> str:
        """``verse`` of the underlying verse reference."""
        return self.verse_ref.verse

    @property
    def versification(self) -> Versification:
        """``versification`` of the underlying verse reference."""
        return self.verse_ref.versification

    @property
    def is_empty(self) -> bool:
        """``is_default`` of the underlying verse reference."""
        return self.verse_ref.is_default

    @property
    def is_verse(self) -> bool:
        """True when the verse number is non-zero and the path is empty."""
        # Read the verse number from this instance's verse reference. Looking the
        # attribute up on the VerseRef class does not reflect this reference's value.
        return self.verse_ref.verse_num != 0 and len(self.path) == 0

    def change_versification(self, versification: Versification) -> ScriptureRef:
        """Return a new reference whose verse reference uses ``versification``.

        The verse reference is copied before it is changed, so this instance is
        left untouched. The path list is shared with the returned reference.
        """
        vr: VerseRef = self.verse_ref.copy()
        vr.change_versification(versification)
        return ScriptureRef(vr, self.path)

    def overlaps(self, other: ScriptureRef) -> bool:
        """Return True when the verse ranges overlap and both paths are equal."""
        if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
            return False
        return self.path == other.path

    def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True) -> int:
        """Compare this reference with another one.

        Verse references are compared first. If they tie, path elements are compared
        pairwise. If every shared element ties, the shorter path sorts first.

        Args:
            other: The reference to compare against.
            compare_segments: Forwarded to the verse reference comparison.
            strict: Forwarded to each path element comparison.

        Returns:
            A negative number, zero, or a positive number when this reference sorts
            before, equal to, or after ``other``.

        Raises:
            TypeError: If ``other`` is not a ScriptureRef.
        """
        if not isinstance(other, ScriptureRef):
            raise TypeError("other is not a ScriptureRef object.")
        if self is other:
            return 0

        res = self.verse_ref.compare_to(other.verse_ref, compare_segments=compare_segments)
        if res != 0:
            return res

        for se1, se2 in zip(self.path, other.path):
            res = se1.compare_to(se2, strict=strict)
            if res != 0:
                return res

        return len(self.path) - len(other.path)

    def __eq__(self, other: object) -> bool:
        """Return True when verse reference and path both match; NotImplemented for other types."""
        if not isinstance(other, ScriptureRef):
            return NotImplemented
        return self.verse_ref == other.verse_ref and self.path == other.path

    def __lt__(self, other: object) -> bool:
        """Return True when this reference sorts before ``other`` per ``compare_to``."""
        if not isinstance(other, ScriptureRef):
            return NotImplemented
        return self.compare_to(other) < 0

    def __hash__(self) -> int:
        """Hash on the verse reference and the path elements that ``__eq__`` compares."""
        return hash((self.verse_ref, tuple(self.path)))

    def __repr__(self) -> str:
        """Return the verse reference followed by one ``/``-prefixed entry per path element."""
        # Join the verse reference and the elements together so that an empty path
        # yields no trailing "/". ``parse`` would read a trailing "/" back as an
        # extra element with an empty name.
        return "/".join([str(self.verse_ref), *(str(se) for se in self.path)])


# Module-level ScriptureRef built with no arguments, i.e. wrapping a
# default-constructed VerseRef and an empty path.
EMPTY_SCRIPTURE_REF = ScriptureRef()
Loading
Loading