Skip to content

Commit

Permalink
NParallelTextCorpus (#270)
Browse files Browse the repository at this point in the history
* Working NParallelTextCorpus
* Change mergedtextcorpus parameter
* Move same ref rows out of range info
* Make seed optional; remove unneeded code
  • Loading branch information
Enkidu93 authored Nov 25, 2024
1 parent 9720b46 commit 9724d44
Show file tree
Hide file tree
Showing 13 changed files with 1,741 additions and 599 deletions.
27 changes: 27 additions & 0 deletions src/SIL.Machine/Corpora/CorporaExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,33 @@ public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)

#endregion

#region INParallelTextCorpus operations

/// <summary>
/// Wraps the given text corpora in a new <see cref="NParallelTextCorpus"/>.
/// </summary>
/// <param name="corpora">The text corpora handed to the <see cref="NParallelTextCorpus"/> constructor.</param>
/// <param name="allRowsPerCorpus">
/// Optional flags that are copied into the corpus' <c>AllRows</c> property. When <c>null</c>,
/// <c>AllRows</c> keeps whatever value the constructor gave it.
/// NOTE(review): presumably one flag per corpus, in the same order as <paramref name="corpora"/> - confirm
/// against <see cref="NParallelTextCorpus"/>.
/// </param>
/// <returns>The newly created n-parallel corpus.</returns>
public static INParallelTextCorpus AlignMany(
    this IEnumerable<ITextCorpus> corpora,
    IEnumerable<bool> allRowsPerCorpus = null
)
{
    NParallelTextCorpus aligned = new NParallelTextCorpus(corpora);
    if (allRowsPerCorpus == null)
        return aligned;

    // Only override AllRows when the caller explicitly supplied flags.
    aligned.AllRows = allRowsPerCorpus.ToArray();
    return aligned;
}

/// <summary>
/// Merges the given corpora into a single <see cref="MergedTextCorpus"/> that uses
/// <see cref="MergeRule.Random"/> to decide which corpus supplies each row.
/// </summary>
/// <param name="corpora">The text corpora to merge.</param>
/// <param name="seed">Optional seed forwarded to the merged corpus' random number generator.</param>
/// <returns>The merged text corpus.</returns>
public static ITextCorpus ChooseRandom(this IEnumerable<ITextCorpus> corpora, int? seed = null) =>
    new MergedTextCorpus(corpora, MergeRule.Random, seed);

/// <summary>
/// Merges the given corpora into a single <see cref="MergedTextCorpus"/> that uses
/// <see cref="MergeRule.First"/> to decide which corpus supplies each row.
/// </summary>
/// <param name="corpora">The text corpora to merge.</param>
/// <returns>The merged text corpus.</returns>
public static ITextCorpus ChooseFirst(this IEnumerable<ITextCorpus> corpora) =>
    new MergedTextCorpus(corpora, MergeRule.First);

#endregion

#region IAlignmentCorpus operations

public static IAlignmentCorpus Transform(
Expand Down
5 changes: 5 additions & 0 deletions src/SIL.Machine/Corpora/CorpusAlignmentException.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,10 @@ public CorpusAlignmentException(string sourceRef, string targetRef)
: base(
$"Invalid format in {sourceRef} and {targetRef}. Mismatched key formats \"{sourceRef}\" and \"{targetRef}\". There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs."
) { }

/// <summary>
/// Creates an alignment exception whose message lists every supplied ref, separated by ", ".
/// </summary>
/// <param name="refs">The refs to name in the exception message.</param>
public CorpusAlignmentException(string[] refs)
    : base(
        "Invalid format in "
            + string.Join(", ", refs)
            + ". Mismatched key formats. There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs."
    ) { }
}
}
11 changes: 11 additions & 0 deletions src/SIL.Machine/Corpora/INParallelTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
using System.Collections.Generic;

namespace SIL.Machine.Corpora
{
/// <summary>
/// A corpus whose rows are <see cref="NParallelTextRow"/> instances.
/// </summary>
public interface INParallelTextCorpus : ICorpus<NParallelTextRow>
{
/// <summary>
/// Returns a row count for this corpus.
/// </summary>
/// <param name="includeEmpty">
/// NOTE(review): presumably controls whether empty rows are counted - confirm against the implementation.
/// </param>
/// <param name="textIds">
/// NOTE(review): presumably restricts the count to the texts with these ids, with <c>null</c> meaning no
/// restriction - confirm against the implementation.
/// </param>
int Count(bool includeEmpty = true, IEnumerable<string> textIds = null);

/// <summary>
/// Enumerates rows of this corpus.
/// </summary>
/// <param name="textIds">
/// NOTE(review): presumably the ids of the texts whose rows should be returned - confirm against the
/// implementation.
/// </param>
IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textIds);
}
}
8 changes: 8 additions & 0 deletions src/SIL.Machine/Corpora/MergeRule.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace SIL.Machine.Corpora
{
/// <summary>
/// Selects how <see cref="MergedTextCorpus"/> picks the corpus that supplies each merged row.
/// </summary>
public enum MergeRule
{
/// <summary>The candidate corpus with the lowest index is used.</summary>
First,
/// <summary>A candidate corpus is drawn at random using the merged corpus' random number generator.</summary>
Random
}
}
77 changes: 77 additions & 0 deletions src/SIL.Machine/Corpora/MergedTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
using System;
using System.Collections.Generic;
using System.Linq;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
/// <summary>
/// A text corpus that collapses several text corpora into one. The corpora are aligned with an
/// <see cref="NParallelTextCorpus"/> and, for every aligned row, the segment and flags of a single corpus are
/// emitted. Which corpus is used is decided by the configured <see cref="MergeRule"/>.
/// </summary>
public class MergedTextCorpus : TextCorpusBase
{
    private readonly NParallelTextCorpus _corpus;

    private readonly MergeRule _mergeRule;

    private readonly Random _random;

    /// <summary>
    /// Creates a merged corpus over the given corpora.
    /// </summary>
    /// <param name="corpora">The text corpora to merge. The sequence is enumerated exactly once.</param>
    /// <param name="mergeRule">How the corpus supplying each row is chosen.</param>
    /// <param name="seed">
    /// Optional seed for the random number generator used by <see cref="MergeRule.Random"/>. When
    /// <c>null</c>, an unseeded <see cref="Random"/> is created.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="corpora"/> is <c>null</c>.</exception>
    public MergedTextCorpus(IEnumerable<ITextCorpus> corpora, MergeRule mergeRule, int? seed = null)
    {
        if (corpora == null)
            throw new ArgumentNullException(nameof(corpora));

        // Materialize once. The sequence used to be enumerated twice (by the NParallelTextCorpus constructor
        // and again by Count()), which for a lazy or one-shot sequence could yield an AllRows array whose
        // length does not match the number of aligned corpora.
        List<ITextCorpus> corpusList = corpora.ToList();
        _corpus = new NParallelTextCorpus(corpusList)
        {
            AllRows = Enumerable.Repeat(true, corpusList.Count).ToArray()
        };
        _mergeRule = mergeRule;
        _random = seed != null ? new Random(seed.Value) : new Random();
    }

    /// <summary>The texts of all underlying corpora, concatenated in corpus order.</summary>
    public override IEnumerable<IText> Texts => _corpus.Corpora.SelectMany(c => c.Texts);

    /// <summary><c>true</c> only when every underlying corpus reports itself as tokenized.</summary>
    public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i));

    /// <summary>The versification of the first underlying corpus, or <c>null</c> when there are none.</summary>
    public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null;

    /// <summary>
    /// Enumerates the aligned rows and yields one <see cref="TextRow"/> per row, taking segment and flags
    /// from a single selected corpus.
    /// </summary>
    /// <param name="textIds">Forwarded unchanged to the underlying n-parallel corpus.</param>
    /// <returns>The merged rows, produced lazily.</returns>
    public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
    {
        // Index of the corpus whose range is currently being emitted, or -1 when no range is open.
        int indexOfInRangeRow = -1;
        foreach (NParallelTextRow nRow in _corpus.GetRows(textIds))
        {
            // Candidates: corpora whose segment has content or that are flagged as in-range for this row.
            IReadOnlyList<int> nonEmptyIndices = nRow
                .NSegments.Select((s, i) => (s, i))
                .Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i))
                .Select(pair => pair.i)
                .ToList();
            // If no corpus qualifies, fall back to considering all of them.
            IReadOnlyList<int> indices =
                nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList();
            if (indexOfInRangeRow == -1)
            {
                // No range is open: a corpus that is in-range without being a range start is not eligible.
                indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList();
            }
            if (indices.Count == 0)
                continue;

            int indexOfSelectedRow = -1;
            switch (_mergeRule)
            {
                case MergeRule.First:
                    indexOfSelectedRow = indices.First();
                    break;
                case MergeRule.Random:
                    indexOfSelectedRow = indices[_random.Next(0, indices.Count)];
                    break;
            }

            // While a range is open, keep reading from the corpus that opened it.
            indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow;
            if (!nRow.IsInRange(indexOfSelectedRow))
            {
                indexOfInRangeRow = -1;
            }
            if (nRow.IsRangeStart(indexOfSelectedRow))
            {
                indexOfInRangeRow = indexOfSelectedRow;
            }

            yield return new TextRow(nRow.TextId, nRow.Ref)
            {
                Segment = nRow.NSegments[indexOfSelectedRow],
                Flags = nRow.NFlags[indexOfSelectedRow]
            };
        }
    }
}
}
Loading

0 comments on commit 9724d44

Please sign in to comment.