Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port USFM code from Machine up to commit a9058ce #111

Merged
merged 6 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions machine/corpora/dictionary_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Iterable, Optional, overload

from ..scripture.verse_ref import Versification
from .text import Text
from .text_corpus import TextCorpus

Expand All @@ -21,6 +22,7 @@ def __init__(self, *args, **kwargs) -> None:
texts = args[0]
self._texts = {t.id: t for t in texts}
self._is_tokenized = False
self._versification = None

@property
def texts(self) -> Iterable[Text]:
Expand All @@ -34,6 +36,14 @@ def is_tokenized(self) -> bool:
def is_tokenized(self, value: bool) -> None:
self._is_tokenized = value

@property
def versification(self) -> Optional[Versification]:
    """Return the versification stored on this corpus, or ``None`` if none is stored."""
    return self._versification

@versification.setter
def versification(self, value: Versification) -> None:
    """Store ``value`` as the versification of this corpus."""
    self._versification = value

def __getitem__(self, id: str) -> Optional[Text]:
    """Look up a text by its id.

    Returns the entry of ``self._texts`` stored under ``id``, or ``None`` when
    there is no such entry.
    """
    found = self._texts.get(id)
    return found

Expand Down
5 changes: 5 additions & 0 deletions machine/corpora/flatten.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from itertools import chain
from typing import Generator, Iterable, List, Optional, cast, overload

from ..scripture.verse_ref import Versification
from .alignment_collection import AlignmentCollection
from .alignment_corpus import AlignmentCorpus
from .alignment_row import AlignmentRow
Expand Down Expand Up @@ -54,6 +55,10 @@ def texts(self) -> Iterable[Text]:
def is_tokenized(self) -> bool:
return all(c.is_tokenized for c in self._corpora)

@property
def versification(self) -> Optional[Versification]:
    """Return the versification of the first wrapped corpus.

    Returns ``None`` when there are no wrapped corpora.
    """
    if len(self._corpora) == 0:
        return None
    return self._corpora[0].versification

def count(self, include_empty: bool = True) -> int:
    """Return the sum of ``count(include_empty)`` over every wrapped corpus.

    Args:
        include_empty: Forwarded unchanged to each wrapped corpus's ``count``.

    Returns:
        The combined count; ``0`` when there are no wrapped corpora.
    """
    total = 0
    for corpus in self._corpora:
        total += corpus.count(include_empty)
    return total

Expand Down
2 changes: 2 additions & 0 deletions machine/corpora/parallel_text_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def __init__(
source_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
target_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
) -> None:
if not text_id:
raise ValueError("A text_id must be set.")
if len(source_refs) == 0 and len(target_refs) == 0:
raise ValueError("Either a source or target ref must be set.")
self._text_id = text_id
Expand Down
3 changes: 2 additions & 1 deletion machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class ParatextBackupTextCorpus(ScriptureTextCorpus):
def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
def __init__(self, filename: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
with ZipFile(filename, "r") as archive:
parser = ZipParatextProjectSettingsParser(archive)
settings = parser.parse()
Expand All @@ -28,6 +28,7 @@ def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
sfm_entry.filename,
versification,
include_markers,
include_all_text,
)
)

Expand Down
12 changes: 10 additions & 2 deletions machine/corpora/paratext_project_settings_parser_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,16 @@ def parse(self) -> ParatextProjectSettings:
post_part = naming_elem.get("PostPart")
if post_part:
suffix = post_part
biblical_terms = settings_tree.getroot().findtext("BiblicalTermsListSetting", "")
parts = biblical_terms.split(":", 2)
biblical_terms_list_setting = settings_tree.getroot().findtext("BiblicalTermsListSetting", "")
if biblical_terms_list_setting is None:
# Default to Major::BiblicalTerms.xml to mirror Paratext behavior
biblical_terms_list_setting = "Major::BiblicalTerms.xml"
parts = biblical_terms_list_setting.split(":", 2)
if len(parts) != 3:
raise ValueError(
f"The BiblicalTermsListSetting element in Settings.xml in project {full_name}"
f" is not in the expected format (i.e., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}."
)

return ParatextProjectSettings(
name, full_name, encoding, versification, stylesheet, prefix, form, suffix, parts[0], parts[1], parts[2]
Expand Down
11 changes: 9 additions & 2 deletions machine/corpora/paratext_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class ParatextTextCorpus(ScriptureTextCorpus):
def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
def __init__(self, project_dir: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
parser = FileParatextProjectSettingsParser(project_dir)
settings = parser.parse()

Expand All @@ -17,7 +17,14 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
texts.append(
UsfmFileText(settings.stylesheet, settings.encoding, sfm_filename, versification, include_markers)
UsfmFileText(
settings.stylesheet,
settings.encoding,
sfm_filename,
versification,
include_markers,
include_all_text,
)
)

super().__init__(versification, texts)
174 changes: 174 additions & 0 deletions machine/corpora/scripture_ref_usfm_parser_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
from abc import ABC
from enum import Enum, auto
from typing import List, Optional, Sequence

from ..scripture.scripture_element import ScriptureElement
from ..scripture.scripture_ref import ScriptureRef
from ..scripture.verse_ref import VerseRef, are_overlapping_verse_ranges
from .corpora_utils import merge_verse_ranges
from .usfm_parser_handler import UsfmParserHandler
from .usfm_parser_state import UsfmParserState
from .usfm_token import UsfmAttribute


class ScriptureTextType(Enum):
    """Kind of text a USFM handler is currently inside."""

    # Text outside of a verse; also used as the fallback when no type is active.
    NONVERSE = auto()
    # Text that belongs to a verse.
    VERSE = auto()
    # Text inside a note.
    NOTE = auto()


class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
    """USFM parser handler that tracks a scripture reference for the text it sees.

    The handler keeps the current verse reference, a stack of
    ``ScriptureElement`` objects describing the position inside the current
    verse, and a stack of ``ScriptureTextType`` values. Subclasses override the
    ``_start_*_text`` / ``_end_*_text`` hooks, which receive the
    ``ScriptureRef`` objects built from that state.
    """

    def __init__(self) -> None:
        self._cur_verse_ref: VerseRef = VerseRef()
        # Innermost element is last.
        self._cur_elements_stack: List[ScriptureElement] = []
        # Innermost (currently active) text type is last.
        self._cur_text_type_stack: List[ScriptureTextType] = []
        # True after a verse with the same reference as the current one was seen.
        self._duplicate_verse: bool = False

    @property
    def _current_text_type(self) -> ScriptureTextType:
        """Innermost active text type; ``NONVERSE`` when the stack is empty."""
        if not self._cur_text_type_stack:
            return ScriptureTextType.NONVERSE
        return self._cur_text_type_stack[-1]

    def end_usfm(self, state: UsfmParserState) -> None:
        """Close any verse text that is still open."""
        self._end_verse_text_wrapper(state)

    def chapter(self, state: UsfmParserState, number: str, marker: str, alt_number: str, pub_number: str) -> None:
        """Close any open verse text, then move to the parser state's verse reference."""
        self._end_verse_text_wrapper(state)
        self._update_verse_ref(state.verse_ref, marker)

    def verse(
        self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
    ) -> None:
        """Handle a verse, distinguishing duplicate, overlapping and new verses."""
        if state.verse_ref == self._cur_verse_ref:
            # Same reference as the current verse: close it and ignore the duplicate.
            self._end_verse_text_wrapper(state)
            self._duplicate_verse = True
            return

        if are_overlapping_verse_ranges(number, self._cur_verse_ref.verse):
            # Merge overlapping verse ranges into one range; the open verse text stays open.
            merged_ref: VerseRef = self._cur_verse_ref.copy()
            merged_ref.verse = merge_verse_ranges(number, self._cur_verse_ref.verse)
            self._update_verse_ref(merged_ref, marker)
            return

        # A different verse: close the previous one and open the new one.
        self._end_verse_text_wrapper(state)
        self._update_verse_ref(state.verse_ref, marker)
        self._start_verse_text_wrapper(state)

    def start_para(
        self,
        state: UsfmParserState,
        marker: str,
        unknown: Optional[bool],
        attributes: Optional[Sequence[UsfmAttribute]],
    ) -> None:
        """Open a parent element and non-verse text when the paragraph is not verse text."""
        if self._cur_verse_ref.is_default:
            self._update_verse_ref(state.verse_ref, marker)
        if state.is_verse_text:
            return
        self._start_parent_element(marker)
        self._start_non_verse_text_wrapper(state)

    def end_para(self, state: UsfmParserState, marker: str) -> None:
        """Close the parent element and non-verse text if the current text type is non-verse."""
        if self._current_text_type != ScriptureTextType.NONVERSE:
            return
        self._end_parent_element()
        self._end_non_verse_text_wrapper(state)

    def start_row(self, state: UsfmParserState, marker: str) -> None:
        """Open a parent element for the row when the current text type is non-verse."""
        if self._current_text_type != ScriptureTextType.NONVERSE:
            return
        self._start_parent_element(marker)

    def end_row(self, state: UsfmParserState, marker: str) -> None:
        """Close the row's parent element when the current text type is non-verse."""
        if self._current_text_type != ScriptureTextType.NONVERSE:
            return
        self._end_parent_element()

    def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: int) -> None:
        """Open a parent element and non-verse text for the cell when the current type is non-verse."""
        if self._current_text_type != ScriptureTextType.NONVERSE:
            return
        self._start_parent_element(marker)
        self._start_non_verse_text_wrapper(state)

    def end_cell(self, state: UsfmParserState, marker: str) -> None:
        """Close the cell's parent element and non-verse text when the current type is non-verse."""
        if self._current_text_type != ScriptureTextType.NONVERSE:
            return
        self._end_parent_element()
        self._end_non_verse_text_wrapper(state)

    def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> None:
        """Open a parent element for the sidebar."""
        self._start_parent_element(marker)

    def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None:
        """Close the sidebar's parent element."""
        self._end_parent_element()

    def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
        """Advance to the next element and open note text."""
        self._next_element(marker)
        self._start_note_text_wrapper(state)

    def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
        """Close note text."""
        self._end_note_text_wrapper(state)

    def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None:
        """No-op in this class."""
        ...

    # Hooks for subclasses; each is a no-op in this class.

    def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[List[ScriptureRef]]) -> None:
        """Hook invoked when verse text is opened."""
        ...

    def _end_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None:
        """Hook invoked when verse text is closed."""
        ...

    def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
        """Hook invoked when non-verse text is opened."""
        ...

    def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
        """Hook invoked when non-verse text is closed."""
        ...

    def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
        """Hook invoked when note text is opened."""
        ...

    def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
        """Hook invoked when note text is closed."""
        ...

    def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Clear the duplicate flag, push ``VERSE`` and call the start-verse hook."""
        self._duplicate_verse = False
        self._cur_text_type_stack.append(ScriptureTextType.VERSE)
        self._start_verse_text(state, self._create_verse_refs())

    def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Call the end-verse hook and pop the text type, unless there is nothing to close.

        Nothing is done when the current verse was flagged as a duplicate or
        when the current verse number is 0.
        """
        if self._duplicate_verse or self._cur_verse_ref.verse_num == 0:
            return
        self._end_verse_text(state, self._create_verse_refs())
        self._cur_text_type_stack.pop()

    def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Push ``NONVERSE`` and call the start-non-verse hook."""
        self._cur_text_type_stack.append(ScriptureTextType.NONVERSE)
        self._start_non_verse_text(state, self._create_non_verse_ref())

    def _end_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Call the end-non-verse hook, then pop the text type."""
        self._end_non_verse_text(state, self._create_non_verse_ref())
        self._cur_text_type_stack.pop()

    def _start_note_text_wrapper(self, state: UsfmParserState) -> None:
        """Push ``NOTE`` and call the start-note hook."""
        self._cur_text_type_stack.append(ScriptureTextType.NOTE)
        self._start_note_text(state, self._create_non_verse_ref())

    def _end_note_text_wrapper(self, state: UsfmParserState) -> None:
        """Call the end-note hook, then pop the text type."""
        self._end_note_text(state, self._create_non_verse_ref())
        self._cur_text_type_stack.pop()

    def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
        """Make a copy of ``verse_ref`` the current verse reference.

        The element stack is reset to a single position-0 element for
        ``marker`` unless ``verse_ref`` overlaps the current reference.
        """
        if not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
            self._cur_elements_stack.clear()
            self._cur_elements_stack.append(ScriptureElement(0, marker))
        self._cur_verse_ref = verse_ref.copy()

    def _next_element(self, marker: str) -> None:
        """Replace the innermost element with one for ``marker`` at the next position."""
        previous: ScriptureElement = self._cur_elements_stack.pop()
        self._cur_elements_stack.append(ScriptureElement(previous.position + 1, marker))

    def _start_parent_element(self, marker: str) -> None:
        """Advance to the next element, then push a position-0 child element for ``marker``."""
        self._next_element(marker)
        self._cur_elements_stack.append(ScriptureElement(0, marker))

    def _end_parent_element(self) -> None:
        """Pop the innermost element."""
        self._cur_elements_stack.pop()

    def _create_verse_refs(self) -> List[ScriptureRef]:
        """Build one ``ScriptureRef`` per verse covered by the current verse reference."""
        if self._cur_verse_ref.has_multiple:
            return [ScriptureRef(v) for v in self._cur_verse_ref.all_verses()]
        return [ScriptureRef(self._cur_verse_ref)]

    def _create_non_verse_ref(self) -> ScriptureRef:
        """Build a ``ScriptureRef`` from the current verse and the element path.

        When the current reference covers multiple verses, the last verse is
        used. Only elements with a position greater than 0 go into the path.
        """
        if self._cur_verse_ref.has_multiple:
            anchor_ref = list(self._cur_verse_ref.all_verses())[-1]
        else:
            anchor_ref = self._cur_verse_ref
        # No need to reverse unlike in Machine, elements are already added in correct order
        path = [elem for elem in self._cur_elements_stack if elem.position > 0]
        return ScriptureRef(anchor_ref, path)
66 changes: 57 additions & 9 deletions machine/corpora/scripture_text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Generator, List, Optional
from typing import Generator, List, Optional, Union

from ..scripture import ENGLISH_VERSIFICATION
from ..scripture.scripture_ref import ScriptureElement, ScriptureRef
from ..scripture.verse_ref import VerseRef, Versification
from ..utils.context_managed_generator import ContextManagedGenerator
from .corpora_utils import gen, get_scripture_text_sort_key
Expand All @@ -20,19 +21,46 @@ def versification(self) -> Versification:
def get_rows(self) -> ContextManagedGenerator[TextRow, None, None]:
seg_list: List[TextRow] = []
out_of_order = False
prev_verse_ref = VerseRef()
prev_scr_ref = ScriptureRef()
with super().get_rows() as rows:
for row in rows:
verse_ref: VerseRef = row.ref
scr_ref: ScriptureRef = row.ref
seg_list.append(row)
if not out_of_order and verse_ref < prev_verse_ref:
if not out_of_order and scr_ref < prev_scr_ref:
out_of_order = True
prev_verse_ref = verse_ref
prev_scr_ref = scr_ref
if out_of_order:
seg_list.sort(key=lambda r: r.ref)
return ContextManagedGenerator(gen(seg_list))

def _create_rows(
self, ref: Union[List[ScriptureRef], VerseRef], text: str = "", is_sentence_start: bool = True
) -> Generator[TextRow, None, None]:
if isinstance(ref, VerseRef):
yield from self._create_rows_verse_ref(ref, text, is_sentence_start)
else:
yield from self._create_rows_scripture_ref(ref, text, is_sentence_start)

def _create_rows_scripture_ref(
self, scripture_refs: List[ScriptureRef], text: str = "", is_sentence_start: bool = True
) -> Generator[TextRow, None, None]:
if len(scripture_refs) > 1:
first_verse = True
for sref in scripture_refs:
if first_verse:
flags: TextRowFlags = TextRowFlags.IN_RANGE | TextRowFlags.RANGE_START
if is_sentence_start:
flags |= TextRowFlags.SENTENCE_START
yield super()._create_row(text, sref, flags)
first_verse = False
else:
yield self._create_empty_row(sref, TextRowFlags.IN_RANGE)
else:
yield super()._create_row(
text, scripture_refs[0], TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE
)

def _create_rows_verse_ref(
self, verse_ref: VerseRef, text: str = "", is_sentence_start: bool = True
) -> Generator[TextRow, None, None]:
if verse_ref.has_multiple:
Expand All @@ -42,13 +70,33 @@ def _create_rows(
flags = TextRowFlags.IN_RANGE | TextRowFlags.RANGE_START
if is_sentence_start:
flags |= TextRowFlags.SENTENCE_START
yield self._create_row(text, vref, flags)
yield super()._create_row(text, ScriptureRef(vref), flags)
first_verse = False
else:
yield self._create_empty_row(vref, TextRowFlags.IN_RANGE)
yield self._create_empty_row(ScriptureRef(vref), TextRowFlags.IN_RANGE)
else:
yield super()._create_row(
text, ScriptureRef(verse_ref), TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE
)

def _create_row(
self,
ref: Union[ScriptureRef, VerseRef],
text: str,
is_sentence_start: bool,
elements: Optional[List[ScriptureElement]] = None,
) -> TextRow:
if isinstance(ref, VerseRef):
return super()._create_row(
text,
ScriptureRef(ref, elements),
TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE,
)
else:
yield self._create_row(
text, verse_ref, TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE
return super()._create_row(
text,
ref,
TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE,
)

def _create_verse_ref(self, chapter: str, verse: str) -> VerseRef:
Expand Down
Loading
Loading