From d97ea1cf7dbf644510659096f67fcb35302a8444 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 13 Nov 2024 19:29:48 -0500 Subject: [PATCH] Remove alignment corpus from NPTC; move logic to PTC --- .../Corpora/NParallelTextCorpus.cs | 45 ++------------ src/SIL.Machine/Corpora/NParallelTextRow.cs | 2 - src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 58 +++++++++++++------ 3 files changed, 44 insertions(+), 61 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index bc5210c4..4d59fcac 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -26,7 +26,6 @@ public override bool IsTokenized(int i) => public override int N => Corpora.Count; public IReadOnlyList AllRows { get; set; } public override IReadOnlyList Corpora { get; } - public IAlignmentCorpus AlignmentCorpus { get; set; } public IComparer RowRefComparer { get; } private HashSet GetTextIdsFromCorpora() @@ -53,7 +52,6 @@ public override IEnumerable GetRows(IEnumerable textId if (textIds != null) filterTextIds.IntersectWith(textIds); - IEnumerator alignmentEnumerator = null; List> enumeratedCorpora = new List>(); try { @@ -64,10 +62,7 @@ public override IEnumerable GetRows(IEnumerable textId new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification) ); } - - if (AlignmentCorpus != null) - alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator(); - foreach (NParallelTextRow row in GetRows(enumeratedCorpora, alignmentEnumerator)) + foreach (NParallelTextRow row in GetRows(enumeratedCorpora)) yield return row; } finally @@ -76,7 +71,6 @@ public override IEnumerable GetRows(IEnumerable textId { enumerator.Dispose(); } - alignmentEnumerator?.Dispose(); } } @@ -105,10 +99,7 @@ private IList MinRefIndexes(IList refs) return minRefIndexes; } - private IEnumerable GetRows( - IList> enumerators, - IEnumerator alignmentEnumerator - ) + private IEnumerable GetRows(IList> enumerators) { var rangeInfo = new NRangeInfo(N) { @@ -204,27 +195,6 @@ NParallelTextRow row in CreateMinRefRows( else if (minRefIndexes.Count == numberOfRemainingRows) // the refs are all the same { - int compareAlignmentCorpus = -1; - if (AlignmentCorpus != null) - { - do - { - try - { - compareAlignmentCorpus = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare( - currentIncompleteRows[0].Ref, - alignmentEnumerator.Current.Ref - ) - : 1; - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); - } - } while (compareAlignmentCorpus < 0); - } - if ( minRefIndexes .Select(i => @@ -271,10 +241,7 @@ NParallelTextRow row in CreateMinRefRows( foreach ( NParallelTextRow row in CreateRows( rangeInfo, - currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), - alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 - ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() - : null + currentRows.Select((r, i) => completed[i] ? null : r).ToArray() ) ) { @@ -313,8 +280,7 @@ private object[] CorrectVersification(object[] refs, int i) private IEnumerable CreateRows( NRangeInfo rangeInfo, IReadOnlyList rows, - IReadOnlyList forceInRange = null, - IReadOnlyList alignedWordPairs = null + IReadOnlyList forceInRange = null ) { if (rangeInfo.IsInRange) @@ -351,8 +317,7 @@ private IEnumerable CreateRows( yield return new NParallelTextRow(textId, refs) { NSegments = rows.Select(r => r?.Segment ?? Array.Empty()).ToArray(), - NFlags = flags.ToReadOnlyList(), - AlignedWordPairs = alignedWordPairs + NFlags = flags.ToReadOnlyList() }; } diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index e76c57d9..4d58e907 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -46,8 +46,6 @@ public bool IsRangeStart(int i) => public string Text(int i) => string.Join(" ", NSegments[i]); - public IReadOnlyCollection AlignedWordPairs { get; set; } - public NParallelTextRow Invert() { return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), }; diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index f21acc7c..89e8d6c1 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -1,4 +1,6 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; +using System.Linq; namespace SIL.Machine.Corpora { @@ -15,10 +17,7 @@ public ParallelTextCorpus( TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); RowRefComparer = rowRefComparer ?? new NParallelTextCorpus.DefaultRowRefComparer(); - NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }) - { - AlignmentCorpus = AlignmentCorpus - }; + NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }); } public override bool IsSourceTokenized => SourceCorpus.IsTokenized; @@ -36,22 +35,43 @@ public ParallelTextCorpus( public override IEnumerable GetRows(IEnumerable textIds) { - NParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows }; - bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); - foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) + using (IEnumerator alignmentEnumerator = AlignmentCorpus.GetEnumerator()) { - yield return new ParallelTextRow( - nRow.TextId, - nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, - nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref } - ) + NParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows }; + bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); + foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) { - SourceFlags = nRow.NFlags[0], - TargetFlags = nRow.NFlags[1], - SourceSegment = nRow.NSegments[0], - TargetSegment = nRow.NSegments[1], - AlignedWordPairs = nRow.AlignedWordPairs - }; + int compareAlignmentCorpus = -1; + if (AlignmentCorpus != null && nRow.NSegments.All(s => s.Count > 0)) + { + do + { + try + { + compareAlignmentCorpus = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare(nRow.Ref, alignmentEnumerator.Current.Ref) + : 1; + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(nRow.NRefs.Select(r => r.ToString()).ToArray()); + } + } while (compareAlignmentCorpus < 0); + } + yield return new ParallelTextRow( + nRow.TextId, + nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, + nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref } + ) + { + SourceFlags = nRow.NFlags[0], + TargetFlags = nRow.NFlags[1], + SourceSegment = nRow.NSegments[0], + TargetSegment = nRow.NSegments[1], + AlignedWordPairs = + compareAlignmentCorpus == 0 ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() : null + }; + } } } }