Skip to content

Commit

Permalink
Remove alignment corpus from NParallelTextCorpus (NPTC); move logic to ParallelTextCorpus (PTC)
Browse files: browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Nov 14, 2024
1 parent 54ae315 commit d97ea1c
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 61 deletions.
45 changes: 5 additions & 40 deletions src/SIL.Machine/Corpora/NParallelTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ public override bool IsTokenized(int i) =>
public override int N => Corpora.Count;
public IReadOnlyList<bool> AllRows { get; set; }
public override IReadOnlyList<ITextCorpus> Corpora { get; }
public IAlignmentCorpus AlignmentCorpus { get; set; }
public IComparer<object> RowRefComparer { get; }

private HashSet<string> GetTextIdsFromCorpora()
Expand All @@ -53,7 +52,6 @@ public override IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textId
if (textIds != null)
filterTextIds.IntersectWith(textIds);

IEnumerator<AlignmentRow> alignmentEnumerator = null;
List<IEnumerator<TextRow>> enumeratedCorpora = new List<IEnumerator<TextRow>>();
try
{
Expand All @@ -64,10 +62,7 @@ public override IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textId
new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification)
);
}

if (AlignmentCorpus != null)
alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator();
foreach (NParallelTextRow row in GetRows(enumeratedCorpora, alignmentEnumerator))
foreach (NParallelTextRow row in GetRows(enumeratedCorpora))
yield return row;
}
finally
Expand All @@ -76,7 +71,6 @@ public override IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textId
{
enumerator.Dispose();
}
alignmentEnumerator?.Dispose();
}
}

Expand Down Expand Up @@ -105,10 +99,7 @@ private IList<int> MinRefIndexes(IList<object> refs)
return minRefIndexes;
}

private IEnumerable<NParallelTextRow> GetRows(
IList<IEnumerator<TextRow>> enumerators,
IEnumerator<AlignmentRow> alignmentEnumerator
)
private IEnumerable<NParallelTextRow> GetRows(IList<IEnumerator<TextRow>> enumerators)
{
var rangeInfo = new NRangeInfo(N)
{
Expand Down Expand Up @@ -204,27 +195,6 @@ NParallelTextRow row in CreateMinRefRows(
else if (minRefIndexes.Count == numberOfRemainingRows)
// the refs are all the same
{
int compareAlignmentCorpus = -1;
if (AlignmentCorpus != null)
{
do
{
try
{
compareAlignmentCorpus = alignmentEnumerator.MoveNext()
? RowRefComparer.Compare(
currentIncompleteRows[0].Ref,
alignmentEnumerator.Current.Ref
)
: 1;
}
catch (ArgumentException)
{
throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray());
}
} while (compareAlignmentCorpus < 0);
}

if (
minRefIndexes
.Select(i =>
Expand Down Expand Up @@ -271,10 +241,7 @@ NParallelTextRow row in CreateMinRefRows(
foreach (
NParallelTextRow row in CreateRows(
rangeInfo,
currentRows.Select((r, i) => completed[i] ? null : r).ToArray(),
alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0
? alignmentEnumerator.Current.AlignedWordPairs.ToArray()
: null
currentRows.Select((r, i) => completed[i] ? null : r).ToArray()
)
)
{
Expand Down Expand Up @@ -313,8 +280,7 @@ private object[] CorrectVersification(object[] refs, int i)
private IEnumerable<NParallelTextRow> CreateRows(
NRangeInfo rangeInfo,
IReadOnlyList<TextRow> rows,
IReadOnlyList<bool> forceInRange = null,
IReadOnlyList<AlignedWordPair> alignedWordPairs = null
IReadOnlyList<bool> forceInRange = null
)
{
if (rangeInfo.IsInRange)
Expand Down Expand Up @@ -351,8 +317,7 @@ private IEnumerable<NParallelTextRow> CreateRows(
yield return new NParallelTextRow(textId, refs)
{
NSegments = rows.Select(r => r?.Segment ?? Array.Empty<string>()).ToArray(),
NFlags = flags.ToReadOnlyList(),
AlignedWordPairs = alignedWordPairs
NFlags = flags.ToReadOnlyList()
};
}

Expand Down
2 changes: 0 additions & 2 deletions src/SIL.Machine/Corpora/NParallelTextRow.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ public bool IsRangeStart(int i) =>

public string Text(int i) => string.Join(" ", NSegments[i]);

public IReadOnlyCollection<AlignedWordPair> AlignedWordPairs { get; set; }

public NParallelTextRow Invert()
{
return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), };
Expand Down
58 changes: 39 additions & 19 deletions src/SIL.Machine/Corpora/ParallelTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using System.Collections.Generic;
using System;
using System.Collections.Generic;
using System.Linq;

namespace SIL.Machine.Corpora
{
Expand All @@ -15,10 +17,7 @@ public ParallelTextCorpus(
TargetCorpus = targetCorpus;
AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus();
RowRefComparer = rowRefComparer ?? new NParallelTextCorpus.DefaultRowRefComparer();
NParallelTextCorpus = new NParallelTextCorpus(new List<ITextCorpus> { SourceCorpus, TargetCorpus })
{
AlignmentCorpus = AlignmentCorpus
};
NParallelTextCorpus = new NParallelTextCorpus(new List<ITextCorpus> { SourceCorpus, TargetCorpus });
}

public override bool IsSourceTokenized => SourceCorpus.IsTokenized;
Expand All @@ -36,22 +35,43 @@ public ParallelTextCorpus(

public override IEnumerable<ParallelTextRow> GetRows(IEnumerable<string> textIds)
{
NParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows };
bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture();
foreach (var nRow in NParallelTextCorpus.GetRows(textIds))
using (IEnumerator<AlignmentRow> alignmentEnumerator = AlignmentCorpus.GetEnumerator())
{
yield return new ParallelTextRow(
nRow.TextId,
nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref },
nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref }
)
NParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows };
bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture();
foreach (var nRow in NParallelTextCorpus.GetRows(textIds))
{
SourceFlags = nRow.NFlags[0],
TargetFlags = nRow.NFlags[1],
SourceSegment = nRow.NSegments[0],
TargetSegment = nRow.NSegments[1],
AlignedWordPairs = nRow.AlignedWordPairs
};
int compareAlignmentCorpus = -1;
if (AlignmentCorpus != null && nRow.NSegments.All(s => s.Count > 0))
{
do
{
try
{
compareAlignmentCorpus = alignmentEnumerator.MoveNext()
? RowRefComparer.Compare(nRow.Ref, alignmentEnumerator.Current.Ref)
: 1;
}
catch (ArgumentException)
{
throw new CorpusAlignmentException(nRow.NRefs.Select(r => r.ToString()).ToArray());
}
} while (compareAlignmentCorpus < 0);
}
yield return new ParallelTextRow(
nRow.TextId,
nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref },
nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref }
)
{
SourceFlags = nRow.NFlags[0],
TargetFlags = nRow.NFlags[1],
SourceSegment = nRow.NSegments[0],
TargetSegment = nRow.NSegments[1],
AlignedWordPairs =
compareAlignmentCorpus == 0 ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() : null
};
}
}
}
}
Expand Down

0 comments on commit d97ea1c

Please sign in to comment.