Skip to content

Commit

Permalink
More fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Nov 14, 2024
1 parent 0d351f2 commit 14697e4
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 125 deletions.
93 changes: 16 additions & 77 deletions src/SIL.Machine/Corpora/CorporaExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -538,87 +538,26 @@ public static INParallelTextCorpus AlignMany(
return nParallelTextCorpus;
}

/// <summary>
/// Wraps an n-parallel corpus in a <c>MergedCorpus</c> configured with <see cref="MergeRule.Random"/>.
/// </summary>
/// <param name="corpus">The n-parallel corpus to merge into a single text corpus.</param>
/// <param name="seed">Seed forwarded to the merged corpus, which uses it to construct its random generator.</param>
/// <returns>The merged corpus, exposed as an <see cref="ITextCorpus"/>.</returns>
public static ITextCorpus ChooseRandom(this INParallelTextCorpus corpus, int seed) =>
    new MergedCorpus(corpus, MergeRule.Random, seed);

/// <summary>
/// Wraps an n-parallel corpus in a <c>MergedCorpus</c> configured with <see cref="MergeRule.First"/>.
/// </summary>
/// <param name="corpus">The n-parallel corpus to merge into a single text corpus.</param>
/// <returns>The merged corpus, exposed as an <see cref="ITextCorpus"/>.</returns>
/// <remarks>The seed argument of the merged corpus is fixed at 0.</remarks>
public static ITextCorpus ChooseFirst(this INParallelTextCorpus corpus) =>
    new MergedCorpus(corpus, MergeRule.First, 0);

private enum MergeRule
public static ITextCorpus ChooseRandom(
this IEnumerable<ITextCorpus> corpora,
IEnumerable<bool> allRows,
int seed
)
{
First,
Random
return new MergedTextCorpus(
new NParallelTextCorpus(corpora) { AllRows = allRows.ToArray() },
MergeRule.Random,
seed
);
}

private class MergedCorpus : TextCorpusBase
public static ITextCorpus ChooseFirst(this IEnumerable<ITextCorpus> corpora, IEnumerable<bool> allRows)
{
private readonly INParallelTextCorpus _corpus;

private readonly MergeRule _mergeRule;

private readonly Random _random;

public MergedCorpus(INParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed)
{
_corpus = nParallelTextCorpus;
_mergeRule = mergeRule;
_random = new Random(seed);
}

public override IEnumerable<IText> Texts => _corpus.Corpora.SelectMany(c => c.Texts);

public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i));

public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null;

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
int indexOfInRangeRow = -1;
foreach (NParallelTextRow nRow in _corpus.GetRows(textIds))
{
IReadOnlyList<int> nonEmptyIndices = nRow
.NSegments.Select((s, i) => (s, i))
.Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i))
.Select(pair => pair.i)
.ToList();
IReadOnlyList<int> indices =
nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList();
if (indexOfInRangeRow == -1)
{
indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList();
}
if (indices.Count == 0)
continue;
int indexOfSelectedRow = -1;
switch (_mergeRule)
{
case MergeRule.First:
indexOfSelectedRow = indices.First();
break;
case MergeRule.Random:
indexOfSelectedRow = indices[_random.Next(0, indices.Count)];
break;
}
indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow;
if (!nRow.IsInRange(indexOfSelectedRow))
{
indexOfInRangeRow = -1;
}
if (nRow.IsRangeStart(indexOfSelectedRow))
{
indexOfInRangeRow = indexOfSelectedRow;
}
yield return new TextRow(nRow.TextId, nRow.Ref)
{
Segment = nRow.NSegments[indexOfSelectedRow],
Flags = nRow.NFlags[indexOfSelectedRow]
};
}
}
return new MergedTextCorpus(
new NParallelTextCorpus(corpora) { AllRows = allRows.ToArray() },
MergeRule.First,
0
);
}

#endregion
Expand Down
4 changes: 0 additions & 4 deletions src/SIL.Machine/Corpora/INParallelTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@ namespace SIL.Machine.Corpora
{
public interface INParallelTextCorpus : ICorpus<NParallelTextRow>
{
int N { get; }
IReadOnlyList<ITextCorpus> Corpora { get; }

bool IsTokenized(int i);
int Count(bool includeEmpty = true, IEnumerable<string> textIds = null);

IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textIds);
Expand Down
8 changes: 8 additions & 0 deletions src/SIL.Machine/Corpora/MergeRule.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace SIL.Machine.Corpora
{
    /// <summary>
    /// Strategy used when several aligned corpora offer a candidate for the same row and only one may be kept.
    /// </summary>
    public enum MergeRule
    {
        /// <summary>Keep the first candidate in corpus order.</summary>
        First = 0,

        /// <summary>Keep a candidate chosen with a pseudo-random generator.</summary>
        Random = 1
    }
}
74 changes: 74 additions & 0 deletions src/SIL.Machine/Corpora/MergedTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
using System;
using System.Collections.Generic;
using System.Linq;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
    /// <summary>
    /// Presents an <see cref="NParallelTextCorpus"/> as a single text corpus: for each n-parallel row,
    /// the segment and flags of exactly one of the aligned corpora are emitted.
    /// </summary>
    public class MergedTextCorpus : TextCorpusBase
    {
        private readonly NParallelTextCorpus _corpus;

        private readonly MergeRule _mergeRule;

        // Created once per instance, so successive GetRows enumerations continue the same sequence.
        private readonly Random _random;

        /// <summary>
        /// Creates a merged view over <paramref name="nParallelTextCorpus"/>.
        /// </summary>
        /// <param name="nParallelTextCorpus">The aligned corpora to merge.</param>
        /// <param name="mergeRule">How to choose among candidate corpora for a row.</param>
        /// <param name="seed">Seed for the random generator used by <see cref="MergeRule.Random"/>.</param>
        public MergedTextCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed)
        {
            _corpus = nParallelTextCorpus;
            _mergeRule = mergeRule;
            _random = new Random(seed);
        }

        /// <summary>The texts of every underlying corpus, concatenated in corpus order.</summary>
        public override IEnumerable<IText> Texts
        {
            get { return _corpus.Corpora.SelectMany(c => c.Texts); }
        }

        /// <summary>True when no underlying corpus reports itself as untokenized.</summary>
        public override bool IsTokenized
        {
            get { return !Enumerable.Range(0, _corpus.N).Any(i => !_corpus.IsTokenized(i)); }
        }

        /// <summary>The versification of the first underlying corpus, or null when there is none.</summary>
        public override ScrVers Versification
        {
            get
            {
                if (_corpus.N > 0)
                {
                    return _corpus.Corpora[0].Versification;
                }
                return null;
            }
        }

        /// <summary>
        /// Enumerates the n-parallel rows and yields one merged <see cref="TextRow"/> per row that has
        /// at least one eligible corpus.
        /// </summary>
        /// <param name="textIds">Text ids forwarded unchanged to the underlying n-parallel corpus.</param>
        /// <returns>A lazily evaluated sequence of merged rows.</returns>
        public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
        {
            // Index of the corpus whose range is currently being followed; -1 when no range is tracked.
            int trackedRangeIndex = -1;
            foreach (NParallelTextRow nRow in _corpus.GetRows(textIds))
            {
                // Prefer corpora that have content for this row or are inside a range.
                List<int> candidates = nRow
                    .NSegments.Select((segment, index) => (segment, index))
                    .Where(entry => entry.segment.Count > 0 || nRow.IsInRange(entry.index))
                    .Select(entry => entry.index)
                    .ToList();
                if (candidates.Count == 0)
                {
                    // Nothing qualifies, so every corpus is considered.
                    candidates = Enumerable.Range(0, nRow.N).ToList();
                }

                if (trackedRangeIndex == -1)
                {
                    // With no range being followed, drop corpora that are in a range but not at its start.
                    candidates = candidates.FindAll(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i));
                }

                if (candidates.Count == 0)
                {
                    continue;
                }

                // The pick happens before the range override below, so under MergeRule.Random a draw is
                // consumed for every row that reaches this point, even when the pick is then discarded.
                int chosen = PickCandidate(candidates);
                if (trackedRangeIndex != -1)
                {
                    chosen = trackedRangeIndex;
                }

                // NOTE(review): the row on which the tracked corpus first stops being in range is still
                // taken from that corpus; tracking only ends afterwards. Confirm this is intended.
                if (!nRow.IsInRange(chosen))
                {
                    trackedRangeIndex = -1;
                }
                if (nRow.IsRangeStart(chosen))
                {
                    trackedRangeIndex = chosen;
                }

                yield return new TextRow(nRow.TextId, nRow.Ref)
                {
                    Segment = nRow.NSegments[chosen],
                    Flags = nRow.NFlags[chosen]
                };
            }
        }

        // Applies the configured merge rule to a non-empty candidate list; yields -1 for an unknown rule.
        private int PickCandidate(IReadOnlyList<int> candidates)
        {
            if (_mergeRule == MergeRule.First)
            {
                return candidates[0];
            }
            if (_mergeRule == MergeRule.Random)
            {
                return candidates[_random.Next(0, candidates.Count)];
            }
            return -1;
        }
    }
}
95 changes: 63 additions & 32 deletions src/SIL.Machine/Corpora/NParallelTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,18 @@ private IEnumerable<NParallelTextRow> GetRows(IList<IEnumerator<TextRow>> enumer
RowRefComparer = RowRefComparer
};

bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray();
bool[] completed = new bool[N];
int numCompleted = 0;
for (int i = 0; i < N; i++)
{
bool isCompleted = !enumerators[i].MoveNext();
completed[i] = isCompleted;
if (isCompleted)
numCompleted++;
}
int numberOfRemainingRows = N - numCompleted;

while (!completed.All(c => c))
while (numCompleted < N)
{
List<int> minRefIndexes;
List<TextRow> currentRows = enumerators.Select(e => e.Current).ToList();
Expand All @@ -133,7 +142,6 @@ private IEnumerable<NParallelTextRow> GetRows(IList<IEnumerator<TextRow>> enumer
throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray());
}
List<int> nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList();
int numberOfRemainingRows = N - completed.Count(c => c);
if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1)
//then there are some non-min refs or only one incomplete enumerator
{
Expand Down Expand Up @@ -172,9 +180,9 @@ NParallelTextRow row in CreateMinRefRows(
anyNonMinEnumeratorsMidRange
&& nonMinRefIndexes.All(j =>
!completed[j] && currentRows[j].TextId == currentRows[i].TextId
)
) //All non-min rows have the same textId as the given min row
)
.ToList() //TODO refactor
.ToList()
)
)
{
Expand All @@ -184,7 +192,13 @@ NParallelTextRow row in CreateMinRefRows(
foreach (int i in minRefIndexes)
{
rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current);
completed[i] = !enumerators[i].MoveNext();
bool isCompleted = !enumerators[i].MoveNext();
completed[i] = isCompleted;
if (isCompleted)
{
numCompleted++;
numberOfRemainingRows--;
}
}
}
else if (minRefIndexes.Count == numberOfRemainingRows)
Expand Down Expand Up @@ -212,31 +226,11 @@ NParallelTextRow row in CreateMinRefRows(
}
else
{
for (int i = 0; i < rangeInfo.Rows.Count; i++)
foreach (NParallelTextRow row in CreateSameRefRows(rangeInfo, completed, currentRows))
{
if (completed[i])
continue;

for (int j = 0; j < rangeInfo.Rows.Count; j++)
{
if (i == j || completed[j])
continue;

if (CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j]))
{
foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows)
{
var textRows = new TextRow[N];
textRows[i] = tr;
textRows[j] = currentRows[j];
foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows))
{
yield return r;
}
}
}
}
yield return row;
}

foreach (
NParallelTextRow row in CreateRows(
rangeInfo,
Expand All @@ -251,7 +245,13 @@ NParallelTextRow row in CreateRows(
for (int i = 0; i < rangeInfo.Rows.Count; i++)
{
rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]);
completed[i] = !enumerators[i].MoveNext();
bool isCompleted = !enumerators[i].MoveNext();
completed[i] = isCompleted;
if (isCompleted)
{
numCompleted++;
numberOfRemainingRows--;
}
}
}
else
Expand Down Expand Up @@ -338,8 +338,6 @@ private IEnumerable<NParallelTextRow> CreateMinRefRows(
TextRow textRow = currentRows[i];
foreach ((List<TextRow> sameRefRows, int j) in sameRefRowsPerIndex)
{
if (i == j)
continue;
if (CheckSameRefRows(sameRefRows, textRow))
{
alreadyYielded.Add(i);
Expand Down Expand Up @@ -391,6 +389,39 @@ private bool CheckSameRefRows(IList<TextRow> sameRefRows, TextRow otherRow)
return sameRefRows.Count > 0;
}

/// <summary>
/// Pairs the rows previously stored for one corpus with the current row of another corpus.
/// For every ordered pair of distinct, not-yet-completed corpora (source, target) that passes
/// <c>CheckSameRefRows</c>, each row in the source's <c>SameRefRows</c> is combined with the target's
/// current row and handed to <c>CreateRows</c>.
/// </summary>
/// <param name="rangeInfo">Range state holding, per corpus, the stored same-ref rows.</param>
/// <param name="completed">Per-corpus flags; corpora flagged true are skipped as source and as target.</param>
/// <param name="currentRows">The current row of each corpus, indexed by corpus.</param>
/// <returns>A lazily evaluated sequence of the rows produced by <c>CreateRows</c>.</returns>
private IEnumerable<NParallelTextRow> CreateSameRefRows(
    NRangeInfo rangeInfo,
    IList<bool> completed,
    IList<TextRow> currentRows
)
{
    for (int source = 0; source < rangeInfo.Rows.Count; source++)
    {
        if (!completed[source])
        {
            for (int target = 0; target < rangeInfo.Rows.Count; target++)
            {
                // A corpus is never paired with itself, and exhausted corpora have no current row to offer.
                bool eligible = target != source && !completed[target];

                // CheckSameRefRows is only consulted for eligible targets, as in the guard order above.
                if (!eligible || !CheckSameRefRows(rangeInfo.Rows[source].SameRefRows, currentRows[target]))
                {
                    continue;
                }

                foreach (TextRow storedRow in rangeInfo.Rows[source].SameRefRows)
                {
                    // Only the two paired slots are filled; every other slot stays null.
                    var pairing = new TextRow[N];
                    pairing[source] = storedRow;
                    pairing[target] = currentRows[target];
                    foreach (NParallelTextRow produced in CreateRows(rangeInfo, pairing))
                    {
                        yield return produced;
                    }
                }
            }
        }
    }
}

private class RangeRow
{
public IList<object> Refs { get; } = new List<object>();
Expand Down
Loading

0 comments on commit 14697e4

Please sign in to comment.