Skip to content

Commit

Permalink
fix verse numbers missing text - sillsdev/serval#411
Browse files Browse the repository at this point in the history
auto-add missing verses - sillsdev/serval#414
  • Loading branch information
johnml1135 committed Jun 19, 2024
1 parent 4e88953 commit 1ba0556
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 11 deletions.
6 changes: 5 additions & 1 deletion src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ public enum ScriptureTextType
public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
{
private VerseRef _curVerseRef;
/// <summary>
/// Gets the verse reference currently tracked by this handler (read-only view of <c>_curVerseRef</c>).
/// </summary>
protected VerseRef CurVerseRef => _curVerseRef;
private readonly Stack<ScriptureElement> _curElements;
private readonly Stack<ScriptureTextType> _curTextType;
private bool _duplicateVerse = false;
Expand Down Expand Up @@ -235,7 +239,7 @@ private void EndNoteText(UsfmParserState state)
_curTextType.Pop();
}

private void UpdateVerseRef(VerseRef verseRef, string marker)
protected void UpdateVerseRef(VerseRef verseRef, string marker)
{
if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef))
{
Expand Down
1 change: 0 additions & 1 deletion src/SIL.Machine/Corpora/UsfmParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,6 @@ private void CloseElement(bool closed = false)
switch (element.Type)
{
case UsfmElementType.Book:
Handler?.EndBook(State, element.Marker);
break;
case UsfmElementType.Para:
Handler?.EndPara(State, element.Marker);
Expand Down
57 changes: 54 additions & 3 deletions src/SIL.Machine/Corpora/UsfmTextUpdater.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public UsfmTextUpdater(
/// <summary>
/// Finishes the update when the parser reaches the end of the USFM: collects the pending tokens,
/// inserts every row that has not been consumed yet, and then delegates to the base handler.
/// </summary>
/// <param name="state">The parser state, forwarded to <c>CollectTokens</c> and to the base handler.</param>
public override void EndUsfm(UsfmParserState state)
{
CollectTokens(state);
// Rows still left at this point are added before the base class finishes up.
InsertRemainingRows();
base.EndUsfm(state);
}

Expand Down Expand Up @@ -328,21 +329,71 @@ private IReadOnlyList<string> AdvanceRows(IReadOnlyList<ScriptureRef> segScrRefs
if (compare == 0)
{
// source and row match
// grab the text and increment both
// grab the text and increment row now (and source on the next loop)
rowTexts.Add(text);
sourceIndex++;
_rowIndex++;
break;
}
}
if (compare <= 0)
if (compare < 0)
{
// source is ahead row, increment row
InsertOrphanRow(preserveLastVerse: true);
_rowIndex++;
}
}
return rowTexts;
}

/// <summary>
/// Appends the row at <c>_rowIndex</c> to the token list as a new verse (preceded by a chapter
/// token when the row's chapter differs from the current one) followed by the row's text.
/// The row is silently skipped when there are no tokens yet, when its first reference is not a
/// verse, or when it belongs to a different book than the current verse reference.
/// </summary>
/// <param name="preserveLastVerse">
/// When <c>true</c>, the token currently at the end of the list is taken off before the new
/// tokens are added and put back afterwards, so the inserted verse lands in front of it.
/// </param>
private void InsertOrphanRow(bool preserveLastVerse = false)
{
    (IReadOnlyList<ScriptureRef> refs, string rowText) = _rows[_rowIndex];

    // Guard order matters: the references are only inspected once we know there are tokens.
    if (_tokens.Count == 0 || !refs[0].IsVerse || refs[0].BookNum != CurVerseRef.BookNum)
        return;

    ScriptureRef firstRef = refs[0];
    ScriptureRef finalRef = refs.Last();

    int trailingIndex = _tokens.Count - 1;
    UsfmToken trailingToken = _tokens[trailingIndex];
    if (preserveLastVerse)
        _tokens.RemoveAt(trailingIndex);

    // A row in another chapter needs its own chapter marker first.
    if (firstRef.ChapterNum != CurVerseRef.ChapterNum)
        _tokens.Add(new UsfmToken(UsfmTokenType.Chapter, "c", null, null, firstRef.Chapter));

    // Multi-reference rows are written as a "first-last" verse range.
    string verseNumber = refs.Count > 1 ? $"{firstRef.Verse}-{finalRef.Verse}" : firstRef.Verse;
    _tokens.Add(new UsfmToken(UsfmTokenType.Verse, "v", null, null, verseNumber));
    _tokens.Add(new UsfmToken(UsfmTokenType.Text, null, rowText, null));

    if (preserveLastVerse)
        _tokens.Add(trailingToken);

    UpdateVerseRef(finalRef.VerseRef, "v");
}

/// <summary>
/// Inserts every row from the current row index through the end of the row list,
/// advancing the row index past each one.
/// </summary>
private void InsertRemainingRows()
{
    for (; _rowIndex < _rows.Count; _rowIndex++)
        InsertOrphanRow();
}

private void CollectTokens(UsfmParserState state)
{
_tokens.AddRange(_newTokens);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
\v 10 \tc3-4 \qt-s |Jesus\*Chapter 2 verse 10\qt-e\*
\v 11-12
\restore restore information
\v 14
\v 17
\v 21
\c 3
\cl PSALM 3
\s1 Section 1
Expand Down
10 changes: 5 additions & 5 deletions tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public void GetRows_NonEmptyText()

IText text = corpus["MAT"];
TextRow[] rows = text.GetRows().ToArray();
Assert.That(rows, Has.Length.EqualTo(24));
Assert.That(rows, Has.Length.EqualTo(27));

Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification)));
Assert.That(rows[0].Text, Is.EqualTo("Chapter one, verse one."));
Expand Down Expand Up @@ -73,7 +73,7 @@ public void GetRows_NonEmptyText_AllText()

IText text = corpus["MAT"];
TextRow[] rows = text.GetRows().ToArray();
Assert.That(rows, Has.Length.EqualTo(50));
Assert.That(rows, Has.Length.EqualTo(53));

Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/1:h", corpus.Versification)));
Assert.That(rows[0].Text, Is.EqualTo("Matthew"));
Expand Down Expand Up @@ -146,7 +146,7 @@ public void GetRows_SentenceStart()

IText text = corpus["MAT"];
TextRow[] rows = text.GetRows().ToArray();
Assert.That(rows, Has.Length.EqualTo(24));
Assert.That(rows, Has.Length.EqualTo(27));

Assert.That(rows[3].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:4", corpus.Versification)));
Assert.That(rows[3].Text, Is.EqualTo("Chapter one, verse four,"));
Expand Down Expand Up @@ -179,7 +179,7 @@ public void GetRows_IncludeMarkers()

IText text = corpus["MAT"];
TextRow[] rows = text.GetRows().ToArray();
Assert.That(rows, Has.Length.EqualTo(24));
Assert.That(rows, Has.Length.EqualTo(27));

Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification)));
Assert.That(
Expand Down Expand Up @@ -254,7 +254,7 @@ public void GetRows_IncludeMarkers_AllText()

IText text = corpus["MAT"];
TextRow[] rows = text.GetRows().ToArray();
Assert.That(rows, Has.Length.EqualTo(46));
Assert.That(rows, Has.Length.EqualTo(49));

Assert.That(rows[2].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip", corpus.Versification)));
Assert.That(rows[2].Text, Is.EqualTo("An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*"));
Expand Down
47 changes: 47 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,20 @@ public void GetUsfm_Verse_RangeMultipleRowsSingleVerse()
);
}

[Test]
public void GetUsfm_Verse_2_2a_2b()
{
    // Arrange: a verse row plus two sub-verse segment rows (2a, 2b).
    List<(IReadOnlyList<ScriptureRef>, string)> segmentRows = new List<(IReadOnlyList<ScriptureRef>, string)>
    {
        (ScrRef("MAT 2:2"), "Verse 2."),
        (ScrRef("MAT 2:2a"), "Verse 2a."),
        (ScrRef("MAT 2:2b"), "Verse 2b.")
    };
    const string expectedVerseLine = "\\v 2-3 Verse 2. Verse 2a. Verse 2b.\r\n";

    // Act
    string updatedUsfm = UpdateUsfm(segmentRows);

    // Assert: all three texts end up on the single verse line.
    Assert.That(updatedUsfm, Contains.Substring(expectedVerseLine));
}

[Test]
public void GetUsfm_Verse_OptBreak()
{
Expand Down Expand Up @@ -375,6 +389,39 @@ public void GetUsfm_Verse_DoubleVaVp()
);
}

[Test]
public void GetUsfm_Verse_VersesNotInTemplate()
{
    // Rows covering verses that exist in the template, verses that do not,
    // a non-verse reference, and a verse in another chapter.
    var rows = new List<(IReadOnlyList<ScriptureRef>, string)>
    {
        (ScrRef("MAT 2:11"), "Verse 11 1st part of range - include it."),
        (ScrRef("MAT 2:12"), "Verse 12 2nd part of range - include it."),
        (ScrRef("MAT 2:13"), "Verse 13 does not exist - include it."),
        (ScrRef("MAT 2:14", "MAT 2:15"), "Verse 14 exists - include it."),
        (ScrRef("MAT 2:16", "MAT 2:17"), "Verse 17 exists - include it."),
        (ScrRef("MAT 2:18", "MAT 2:19"), "Verse 18-19 does not exist - include it."),
        (ScrRef("MAT 2:20"), "Verse 20 does not exist - include it."),
        (ScrRef("MAT 2:22/3:ip"), "This should not be included - only verse text."),
        (ScrRef("MAT 5:3"), "Chapter 5?!?!? - include it.")
    };

    string target = UpdateUsfm(rows);

    // Both rows of the existing 11-12 range are merged into it.
    Assert.That(
        target,
        Contains.Substring(
            "\\v 11-12 Verse 11 1st part of range - include it. Verse 12 2nd part of range - include it."
        )
    );
    Assert.That(target, Contains.Substring("\\v 13 Verse 13 does not exist - include it."));
    Assert.That(target, Contains.Substring("\\v 14 Verse 14 exists - include it."));
    // Asserted once; the original repeated this exact check on two consecutive lines.
    Assert.That(target, Contains.Substring("\\v 17 Verse 17 exists - include it."));
    Assert.That(target, Contains.Substring("\\v 18-19 Verse 18-19 does not exist - include it."));
    Assert.That(target, Contains.Substring("\\v 20 Verse 20 does not exist - include it."));
    // Non-verse rows must not be inserted.
    Assert.That(target, !Contains.Substring("This should not be included - only verse text."));
    // A row in another chapter gets its own chapter marker.
    Assert.That(target, Contains.Substring("\\c 5\r\n\\v 3 Chapter 5?!?!? - include it."));
}

[Test]
public void GetUsfm_Verse_LastVerse()
{
Expand Down
2 changes: 1 addition & 1 deletion tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public void Tokenize()
string usfm = ReadUsfm();
var tokenizer = new UsfmTokenizer();
IReadOnlyList<UsfmToken> tokens = tokenizer.Tokenize(usfm);
Assert.That(tokens, Has.Count.EqualTo(236));
Assert.That(tokens, Has.Count.EqualTo(239));

Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book));
Assert.That(tokens[0].Marker, Is.EqualTo("id"));
Expand Down

0 comments on commit 1ba0556

Please sign in to comment.