From 14697e44ecbae90351cda05ab094ff13aef52d89 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 14 Nov 2024 15:32:05 -0500 Subject: [PATCH] More fixes --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 93 ++++-------------- .../Corpora/INParallelTextCorpus.cs | 4 - src/SIL.Machine/Corpora/MergeRule.cs | 8 ++ src/SIL.Machine/Corpora/MergedTextCorpus.cs | 74 +++++++++++++++ .../Corpora/NParallelTextCorpus.cs | 95 ++++++++++++------- .../Corpora/CorporaExtensionsTests.cs | 21 ++-- 6 files changed, 170 insertions(+), 125 deletions(-) create mode 100644 src/SIL.Machine/Corpora/MergeRule.cs create mode 100644 src/SIL.Machine/Corpora/MergedTextCorpus.cs diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 38ad693c..33186799 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -538,87 +538,26 @@ public static INParallelTextCorpus AlignMany( return nParallelTextCorpus; } - public static ITextCorpus ChooseRandom(this INParallelTextCorpus corpus, int seed) - { - return new MergedCorpus(corpus, MergeRule.Random, seed); - } - - public static ITextCorpus ChooseFirst(this INParallelTextCorpus corpus) - { - return new MergedCorpus(corpus, MergeRule.First, 0); - } - - private enum MergeRule + public static ITextCorpus ChooseRandom( + this IEnumerable corpora, + IEnumerable allRows, + int seed + ) { - First, - Random + return new MergedTextCorpus( + new NParallelTextCorpus(corpora) { AllRows = allRows.ToArray() }, + MergeRule.Random, + seed + ); } - private class MergedCorpus : TextCorpusBase + public static ITextCorpus ChooseFirst(this IEnumerable corpora, IEnumerable allRows) { - private readonly INParallelTextCorpus _corpus; - - private readonly MergeRule _mergeRule; - - private readonly Random _random; - - public MergedCorpus(INParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) - { - _corpus = nParallelTextCorpus; - _mergeRule = mergeRule; - _random = new Random(seed); - } - - public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); - - public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i)); - - public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null; - - public override IEnumerable GetRows(IEnumerable textIds) - { - int indexOfInRangeRow = -1; - foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) - { - IReadOnlyList nonEmptyIndices = nRow - .NSegments.Select((s, i) => (s, i)) - .Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i)) - .Select(pair => pair.i) - .ToList(); - IReadOnlyList indices = - nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); - if (indexOfInRangeRow == -1) - { - indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList(); - } - if (indices.Count == 0) - continue; - int indexOfSelectedRow = -1; - switch (_mergeRule) - { - case MergeRule.First: - indexOfSelectedRow = indices.First(); - break; - case MergeRule.Random: - indexOfSelectedRow = indices[_random.Next(0, indices.Count)]; - break; - } - indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; - if (!nRow.IsInRange(indexOfSelectedRow)) - { - indexOfInRangeRow = -1; - } - if (nRow.IsRangeStart(indexOfSelectedRow)) - { - indexOfInRangeRow = indexOfSelectedRow; - } - yield return new TextRow(nRow.TextId, nRow.Ref) - { - Segment = nRow.NSegments[indexOfSelectedRow], - Flags = nRow.NFlags[indexOfSelectedRow] - }; - } - } + return new MergedTextCorpus( + new NParallelTextCorpus(corpora) { AllRows = allRows.ToArray() }, + MergeRule.First, + 0 + ); } #endregion diff --git a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs index 0dfde2fa..5a1e86f7 100644 --- a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs @@ -4,10 +4,6 @@ namespace SIL.Machine.Corpora { public interface INParallelTextCorpus : ICorpus { - int N { get; } - IReadOnlyList Corpora { get; } - - bool IsTokenized(int i); int Count(bool includeEmpty = true, IEnumerable textIds = null); IEnumerable GetRows(IEnumerable textIds); diff --git a/src/SIL.Machine/Corpora/MergeRule.cs b/src/SIL.Machine/Corpora/MergeRule.cs new file mode 100644 index 00000000..be9a2cee --- /dev/null +++ b/src/SIL.Machine/Corpora/MergeRule.cs @@ -0,0 +1,8 @@ +namespace SIL.Machine.Corpora +{ + public enum MergeRule + { + First, + Random + } +} diff --git a/src/SIL.Machine/Corpora/MergedTextCorpus.cs b/src/SIL.Machine/Corpora/MergedTextCorpus.cs new file mode 100644 index 00000000..890f7e10 --- /dev/null +++ b/src/SIL.Machine/Corpora/MergedTextCorpus.cs @@ -0,0 +1,74 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class MergedTextCorpus : TextCorpusBase + { + private readonly NParallelTextCorpus _corpus; + + private readonly MergeRule _mergeRule; + + private readonly Random _random; + + public MergedTextCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) + { + _corpus = nParallelTextCorpus; + _mergeRule = mergeRule; + _random = new Random(seed); + } + + public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); + + public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i)); + + public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null; + + public override IEnumerable GetRows(IEnumerable textIds) + { + int indexOfInRangeRow = -1; + foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) + { + IReadOnlyList nonEmptyIndices = nRow + .NSegments.Select((s, i) => (s, i)) + .Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i)) + .Select(pair => pair.i) + .ToList(); + IReadOnlyList indices = + nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); + if (indexOfInRangeRow == -1) + { + indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList(); + } + if (indices.Count == 0) + continue; + int indexOfSelectedRow = -1; + switch (_mergeRule) + { + case MergeRule.First: + indexOfSelectedRow = indices.First(); + break; + case MergeRule.Random: + indexOfSelectedRow = indices[_random.Next(0, indices.Count)]; + break; + } + indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; + if (!nRow.IsInRange(indexOfSelectedRow)) + { + indexOfInRangeRow = -1; + } + if (nRow.IsRangeStart(indexOfSelectedRow)) + { + indexOfInRangeRow = indexOfSelectedRow; + } + yield return new TextRow(nRow.TextId, nRow.Ref) + { + Segment = nRow.NSegments[indexOfSelectedRow], + Flags = nRow.NFlags[indexOfSelectedRow] + }; + } + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index ae4c6e1f..9b5bf777 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -106,9 +106,18 @@ private IEnumerable GetRows(IList> enumer RowRefComparer = RowRefComparer }; - bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); + bool[] completed = new bool[N]; + int numCompleted = 0; + for (int i = 0; i < N; i++) + { + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + numCompleted++; + } + int numberOfRemainingRows = N - numCompleted; - while (!completed.All(c => c)) + while (numCompleted < N) { List minRefIndexes; List currentRows = enumerators.Select(e => e.Current).ToList(); @@ -133,7 +142,6 @@ private IEnumerable GetRows(IList> enumer throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); - int numberOfRemainingRows = N - completed.Count(c => c); if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) //then there are some non-min refs or only one incomplete enumerator { @@ -172,9 +180,9 @@ NParallelTextRow row in CreateMinRefRows( anyNonMinEnumeratorsMidRange && nonMinRefIndexes.All(j => !completed[j] && currentRows[j].TextId == currentRows[i].TextId - ) + ) //All non-min rows have the same textId as the given min row ) - .ToList() //TODO refactor + .ToList() ) ) { @@ -184,7 +192,13 @@ NParallelTextRow row in CreateMinRefRows( foreach (int i in minRefIndexes) { rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current); - completed[i] = !enumerators[i].MoveNext(); + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + { + numCompleted++; + numberOfRemainingRows--; + } } } else if (minRefIndexes.Count == numberOfRemainingRows) @@ -212,31 +226,11 @@ NParallelTextRow row in CreateMinRefRows( } else { - for (int i = 0; i < rangeInfo.Rows.Count; i++) + foreach (NParallelTextRow row in CreateSameRefRows(rangeInfo, completed, currentRows)) { - if (completed[i]) - continue; - - for (int j = 0; j < rangeInfo.Rows.Count; j++) - { - if (i == j || completed[j]) - continue; - - if (CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) - { - foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) - { - var textRows = new TextRow[N]; - textRows[i] = tr; - textRows[j] = currentRows[j]; - foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) - { - yield return r; - } - } - } - } + yield return row; } + foreach ( NParallelTextRow row in CreateRows( rangeInfo, @@ -251,7 +245,13 @@ NParallelTextRow row in CreateRows( for (int i = 0; i < rangeInfo.Rows.Count; i++) { rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); - completed[i] = !enumerators[i].MoveNext(); + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + { + numCompleted++; + numberOfRemainingRows--; + } } } else @@ -338,8 +338,6 @@ private IEnumerable CreateMinRefRows( TextRow textRow = currentRows[i]; foreach ((List sameRefRows, int j) in sameRefRowsPerIndex) { - if (i == j) - continue; if (CheckSameRefRows(sameRefRows, textRow)) { alreadyYielded.Add(i); @@ -391,6 +389,39 @@ private bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) return sameRefRows.Count > 0; } + private IEnumerable CreateSameRefRows( + NRangeInfo rangeInfo, + IList completed, + IList currentRows + ) + { + for (int i = 0; i < rangeInfo.Rows.Count; i++) + { + if (completed[i]) + continue; + + for (int j = 0; j < rangeInfo.Rows.Count; j++) + { + if (i == j || completed[j]) + continue; + + if (CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + { + foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) + { + var textRows = new TextRow[N]; + textRows[i] = tr; + textRows[j] = currentRows[j]; + foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) + { + yield return r; + } + } + } + } + } + } + private class RangeRow { public IList Refs { get; } = new List(); diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index 2f8ec3a5..db5e85ac 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -94,8 +94,7 @@ public void MergedCorpus_SelectFirst() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; - var mergedCorpus = nParallelCorpus.ChooseFirst(); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseFirst([true, true, true]); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); @@ -139,8 +138,7 @@ public void MergedCorpus_SelectRandom_Seed123456() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; - var mergedCorpus = nParallelCorpus.ChooseRandom(123456); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom([true, true, true], 123456); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.Multiple(() => @@ -187,8 +185,7 @@ public void MergedCorpus_SelectRandom_Seed4501() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; - var mergedCorpus = nParallelCorpus.ChooseRandom(4501); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom([true, true, true], 4501); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.Multiple(() => @@ -236,9 +233,9 @@ public void AlignMergedCorpora() ) ); - ITextCorpus sourceCorpus = (new ITextCorpus[] { sourceCorpus1, sourceCorpus1, sourceCorpus3 }) - .AlignMany([true, true, true]) - .ChooseFirst(); + ITextCorpus sourceCorpus = new List { sourceCorpus1, sourceCorpus2, sourceCorpus3 }.ChooseFirst( + [true, true, true] + ); var targetCorpus1 = new DictionaryTextCorpus( new MemoryText( @@ -274,9 +271,9 @@ public void AlignMergedCorpora() ) ); - ITextCorpus targetCorpus = (new ITextCorpus[] { targetCorpus1, targetCorpus2, targetCorpus3 }) - .AlignMany([true, true, true]) - .ChooseFirst(); + ITextCorpus targetCorpus = new List { targetCorpus1, targetCorpus2, targetCorpus3 }.ChooseFirst( + [true, true, true] + ); IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray();