diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
index 661d7fdd..9f5fbfb5 100644
--- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
+++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
@@ -15,6 +15,10 @@ public enum ScriptureTextType
     public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
     {
         private VerseRef _curVerseRef;
+        /// <summary>
+        /// Exposes the value of <c>_curVerseRef</c> to derived classes.
+        /// </summary>
+        protected VerseRef CurVerseRef => _curVerseRef;
         private readonly Stack _curElements;
         private readonly Stack _curTextType;
         private bool _duplicateVerse = false;
@@ -235,7 +239,7 @@ private void EndNoteText(UsfmParserState state)
             _curTextType.Pop();
         }
 
-        private void UpdateVerseRef(VerseRef verseRef, string marker)
+        protected void UpdateVerseRef(VerseRef verseRef, string marker)
         {
             if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef))
             {
diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs
index c17afb38..a77b822d 100644
--- a/src/SIL.Machine/Corpora/UsfmParser.cs
+++ b/src/SIL.Machine/Corpora/UsfmParser.cs
@@ -629,7 +629,6 @@ private void CloseElement(bool closed = false)
             switch (element.Type)
             {
                 case UsfmElementType.Book:
-                    Handler?.EndBook(State, element.Marker);
                     break;
                 case UsfmElementType.Para:
                     Handler?.EndPara(State, element.Marker);
diff --git a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs
index 785d61a1..759d915d 100644
--- a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs
+++ b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs
@@ -41,6 +41,7 @@ public UsfmTextUpdater(
         public override void EndUsfm(UsfmParserState state)
         {
             CollectTokens(state);
+            InsertRemainingRows();
             base.EndUsfm(state);
         }
 
@@ -328,21 +329,81 @@ private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs
                     if (compare == 0)
                     {
                         // source and row match
-                        // grab the text and increment both
+                        // grab the text and increment row now (and source on the next loop)
                         rowTexts.Add(text);
-                        sourceIndex++;
+                        _rowIndex++;
                         break;
                     }
                 }
-                if (compare <= 0)
+                if (compare < 0)
                 {
-                    // source is ahead row, increment row
+                    // source is ahead of row: emit the row as an orphan verse, then increment row
+                    InsertOrphanRow(preserveLastVerse: true);
                     _rowIndex++;
                 }
             }
             return rowTexts;
         }
 
+        /// <summary>
+        /// Emits the row at <c>_rowIndex</c> as new tokens: a chapter token when the row's chapter differs
+        /// from <c>CurVerseRef</c>, then a verse token and a text token. Nothing is emitted when no tokens
+        /// have been collected yet, the row has no references, the row is not verse text, or the row is in
+        /// a different book than <c>CurVerseRef</c>.
+        /// </summary>
+        /// <param name="preserveLastVerse">
+        /// When <c>true</c>, the new tokens are placed before the last collected token rather than after it.
+        /// </param>
+        private void InsertOrphanRow(bool preserveLastVerse = false)
+        {
+            var (rowScrRefs, text) = _rows[_rowIndex];
+
+            if (_tokens.Count == 0 || rowScrRefs.Count == 0)
+                return;
+            var firstRef = rowScrRefs[0];
+            if (!firstRef.IsVerse || firstRef.BookNum != CurVerseRef.BookNum)
+                return;
+
+            var orphanTokens = new List<UsfmToken>();
+            if (firstRef.ChapterNum != CurVerseRef.ChapterNum)
+                orphanTokens.Add(new UsfmToken(UsfmTokenType.Chapter, "c", null, null, firstRef.Chapter));
+            if (rowScrRefs.Count > 1)
+            {
+                orphanTokens.Add(
+                    new UsfmToken(
+                        UsfmTokenType.Verse,
+                        "v",
+                        null,
+                        null,
+                        $"{firstRef.Verse}-{rowScrRefs.Last().Verse}"
+                    )
+                );
+            }
+            else
+            {
+                orphanTokens.Add(new UsfmToken(UsfmTokenType.Verse, "v", null, null, firstRef.Verse));
+            }
+            orphanTokens.Add(new UsfmToken(UsfmTokenType.Text, null, text, null));
+
+            // Splice in one step instead of removing and re-adding the last token.
+            int insertAt = preserveLastVerse ? _tokens.Count - 1 : _tokens.Count;
+            _tokens.InsertRange(insertAt, orphanTokens);
+
+            UpdateVerseRef(rowScrRefs.Last().VerseRef, "v");
+        }
+
+        /// <summary>
+        /// Calls <see cref="InsertOrphanRow"/> for every row from <c>_rowIndex</c> to the end of <c>_rows</c>.
+        /// </summary>
+        private void InsertRemainingRows()
+        {
+            while (_rowIndex < _rows.Count)
+            {
+                InsertOrphanRow();
+                _rowIndex++;
+            }
+        }
+
         private void CollectTokens(UsfmParserState state)
         {
             _tokens.AddRange(_newTokens);
diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM
index 672b93da..e969b40c 100644
--- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM
+++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM
@@ -54,6 +54,9 @@
 \v 10 \tc3-4 \qt-s |Jesus\*Chapter 2 verse 10\qt-e\*
 \v 11-12
 \restore restore information
+\v 14
+\v 17
+\v 21
 \c 3
 \cl PSALM 3
 \s1 Section 1
diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
index 88fc10e1..bf012892 100644
--- a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
@@ -13,7 +13,7 @@ public void GetRows_NonEmptyText()
 
             IText text = corpus["MAT"];
             TextRow[] rows = text.GetRows().ToArray();
-            Assert.That(rows, Has.Length.EqualTo(24));
+            Assert.That(rows, Has.Length.EqualTo(27));
 
             Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification)));
             Assert.That(rows[0].Text, Is.EqualTo("Chapter one, verse one."));
@@ -73,7 +73,7 @@ public void GetRows_NonEmptyText_AllText()
 
             IText text = corpus["MAT"];
             TextRow[] rows = text.GetRows().ToArray();
-            Assert.That(rows, Has.Length.EqualTo(50));
+            Assert.That(rows, Has.Length.EqualTo(53));
 
             Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/1:h", corpus.Versification)));
             Assert.That(rows[0].Text, Is.EqualTo("Matthew"));
@@ -146,7 +146,7 @@ public void GetRows_SentenceStart()
 
             IText text = corpus["MAT"];
             TextRow[] rows = text.GetRows().ToArray();
-            Assert.That(rows, Has.Length.EqualTo(24));
+            Assert.That(rows, Has.Length.EqualTo(27));
 
             Assert.That(rows[3].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:4", corpus.Versification)));
             Assert.That(rows[3].Text, Is.EqualTo("Chapter one, verse four,"));
@@ -179,7 +179,7 @@ public void GetRows_IncludeMarkers()
 
             IText text = corpus["MAT"];
             TextRow[] rows = text.GetRows().ToArray();
-            Assert.That(rows, Has.Length.EqualTo(24));
+            Assert.That(rows, Has.Length.EqualTo(27));
 
             Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification)));
             Assert.That(
@@ -254,7 +254,7 @@ public void GetRows_IncludeMarkers_AllText()
 
             IText text = corpus["MAT"];
             TextRow[] rows = text.GetRows().ToArray();
-            Assert.That(rows, Has.Length.EqualTo(46));
+            Assert.That(rows, Has.Length.EqualTo(49));
 
             Assert.That(rows[2].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip", corpus.Versification)));
             Assert.That(rows[2].Text, Is.EqualTo("An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*"));
diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs
index e414f13b..89352fc1 100644
--- a/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs
@@ -189,6 +189,20 @@ public void GetUsfm_Verse_RangeMultipleRowsSingleVerse()
             );
         }
 
+        [Test]
+        public void GetUsfm_Verse_2_2a_2b()
+        {
+            var rows = new List<(IReadOnlyList, string)>
+            {
+                (ScrRef("MAT 2:2"), "Verse 2."),
+                (ScrRef("MAT 2:2a"), "Verse 2a."),
+                (ScrRef("MAT 2:2b"), "Verse 2b.")
+            };
+
+            string target = UpdateUsfm(rows);
+            Assert.That(target, Contains.Substring("\\v 2-3 Verse 2. Verse 2a. Verse 2b.\r\n"));
+        }
+
         [Test]
         public void GetUsfm_Verse_OptBreak()
         {
@@ -375,6 +389,38 @@ public void GetUsfm_Verse_DoubleVaVp()
             );
         }
 
+        [Test]
+        public void GetUsfm_Verse_VersesNotInTemplate()
+        {
+            var rows = new List<(IReadOnlyList, string)>
+            {
+                (ScrRef("MAT 2:11"), "Verse 11 1st part of range - include it."),
+                (ScrRef("MAT 2:12"), "Verse 12 2nd part of range - include it."),
+                (ScrRef("MAT 2:13"), "Verse 13 does not exist - include it."),
+                (ScrRef("MAT 2:14", "MAT 2:15"), "Verse 14 exists - include it."),
+                (ScrRef("MAT 2:16", "MAT 2:17"), "Verse 17 exists - include it."),
+                (ScrRef("MAT 2:18", "MAT 2:19"), "Verse 18-19 does not exist - include it."),
+                (ScrRef("MAT 2:20"), "Verse 20 does not exist - include it."),
+                (ScrRef("MAT 2:22/3:ip"), "This should not be included - only verse text."),
+                (ScrRef("MAT 5:3"), "Chapter 5?!?!? - include it.")
+            };
+
+            string target = UpdateUsfm(rows);
+            Assert.That(
+                target,
+                Contains.Substring(
+                    "\\v 11-12 Verse 11 1st part of range - include it. Verse 12 2nd part of range - include it."
+                )
+            );
+            Assert.That(target, Contains.Substring("\\v 13 Verse 13 does not exist - include it."));
+            Assert.That(target, Contains.Substring("\\v 14 Verse 14 exists - include it."));
+            Assert.That(target, Contains.Substring("\\v 17 Verse 17 exists - include it."));
+            Assert.That(target, Contains.Substring("\\v 18-19 Verse 18-19 does not exist - include it."));
+            Assert.That(target, Contains.Substring("\\v 20 Verse 20 does not exist - include it."));
+            Assert.That(target, !Contains.Substring("This should not be included - only verse text."));
+            Assert.That(target, Contains.Substring("\\c 5\r\n\\v 3 Chapter 5?!?!? - include it."));
+        }
+
         [Test]
         public void GetUsfm_Verse_LastVerse()
         {
diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
index 6137246a..a763f10c 100644
--- a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
@@ -11,7 +11,7 @@ public void Tokenize()
             string usfm = ReadUsfm();
             var tokenizer = new UsfmTokenizer();
             IReadOnlyList tokens = tokenizer.Tokenize(usfm);
-            Assert.That(tokens, Has.Count.EqualTo(236));
+            Assert.That(tokens, Has.Count.EqualTo(239));
 
             Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book));
             Assert.That(tokens[0].Marker, Is.EqualTo("id"));