Skip to content

Commit

Permalink
Skip parsing excluded books when preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit committed Jun 6, 2024
1 parent afc6328 commit 622ffe5
Show file tree
Hide file tree
Showing 13 changed files with 350 additions and 143 deletions.
62 changes: 39 additions & 23 deletions src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs
Original file line number Diff line number Diff line change
Expand Up @@ -141,15 +141,15 @@ CancellationToken cancellationToken
continue;

int skipCount = 0;
foreach (Row?[] rows in AlignTrainCorpus(sourceTextCorpora, targetTextCorpus))
foreach (Row?[] rows in AlignTrainCorpus(corpus, sourceTextCorpora, targetTextCorpus))
{
if (skipCount > 0)
{
skipCount--;
continue;
}

Row[] trainRows = rows.Where(r => r is not null && IsInTrain(r, corpus)).Cast<Row>().ToArray();
Row[] trainRows = rows.Where(r => IsIncluded(r, corpus.TrainOnChapters)).Cast<Row>().ToArray();
if (trainRows.Length > 0)
{
Row row = trainRows[0];
Expand Down Expand Up @@ -184,10 +184,10 @@ CancellationToken cancellationToken
}
}

foreach (Row row in AlignPretranslateCorpus(sourceTextCorpora[0], targetTextCorpus))
foreach (Row row in AlignPretranslateCorpus(corpus, sourceTextCorpora[0], targetTextCorpus))
{
if (
IsInPretranslate(row, corpus)
IsIncluded(row, corpus.PretranslateChapters)
&& row.SourceSegment.Length > 0
&& (row.TargetSegment.Length == 0 || !IsInTrain(row, corpus))
)
Expand Down Expand Up @@ -233,27 +233,21 @@ JobCompletionStatus completionStatus

private static bool IsInTrain(Row row, Corpus corpus)
{
return IsIncluded(row, corpus.TrainOnAll, corpus.TrainOnTextIds, corpus.TrainOnChapters);
}

private static bool IsInPretranslate(Row row, Corpus corpus)
{
return IsIncluded(row, corpus.PretranslateAll, corpus.PretranslateTextIds, corpus.PretranslateChapters);
if (corpus.TrainOnChapters is not null)
{
if (row.Refs.Any(r => IsInChapters(corpus.TrainOnChapters, r)))
return true;
}
return corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId);
}

private static bool IsIncluded(
Row row,
bool all,
IReadOnlySet<string> textIds,
IReadOnlyDictionary<string, HashSet<int>>? chapters
)
private static bool IsIncluded(Row? row, IReadOnlyDictionary<string, HashSet<int>>? chapters)
{
if (row is null)
return false;
if (chapters is not null)
{
if (row.Refs.Any(r => IsInChapters(chapters, r)))
return true;
}
return all || textIds.Contains(row.TextId);
return row.Refs.Any(r => IsInChapters(chapters, r));
return true;
}

private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookChapters, object rowRef)
Expand All @@ -264,8 +258,21 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
&& (chapters.Contains(sr.ChapterNum) || chapters.Count == 0);
}

private static IEnumerable<Row?[]> AlignTrainCorpus(IReadOnlyList<ITextCorpus> srcCorpora, ITextCorpus trgCorpus)
private static IEnumerable<Row?[]> AlignTrainCorpus(
Corpus corpus,
IReadOnlyList<ITextCorpus> srcCorpora,
ITextCorpus trgCorpus
)
{
if (!corpus.TrainOnAll)
{
IEnumerable<string> textIds = corpus.TrainOnChapters is not null
? corpus.TrainOnChapters.Keys
: corpus.TrainOnTextIds;
srcCorpora = srcCorpora.Select(sc => sc.FilterTexts(textIds)).ToArray();
trgCorpus = trgCorpus.FilterTexts(textIds);
}

if (trgCorpus.IsScripture())
{
return srcCorpora
Expand Down Expand Up @@ -379,8 +386,17 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
}
}

private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus srcCorpus, ITextCorpus trgCorpus)
private static IEnumerable<Row> AlignPretranslateCorpus(Corpus corpus, ITextCorpus srcCorpus, ITextCorpus trgCorpus)
{
if (!corpus.PretranslateAll)
{
IEnumerable<string> textIds = corpus.PretranslateChapters is not null
? corpus.PretranslateChapters.Keys
: corpus.PretranslateTextIds;
srcCorpus = srcCorpus.FilterTexts(textIds);
trgCorpus = trgCorpus.FilterTexts(textIds);
}

int rowCount = 0;
StringBuilder srcSegBuffer = new();
StringBuilder trgSegBuffer = new();
Expand Down
30 changes: 26 additions & 4 deletions src/SIL.Machine/Corpora/AlignmentCorpusBase.cs
Original file line number Diff line number Diff line change
@@ -1,16 +1,38 @@
using System.Collections.Generic;
using System.Collections;
using System.Collections.Generic;
using System.Linq;

namespace SIL.Machine.Corpora
{
public abstract class AlignmentCorpusBase : CorpusBase<AlignmentRow>, IAlignmentCorpus
public abstract class AlignmentCorpusBase : IAlignmentCorpus
{
public abstract IEnumerable<IAlignmentCollection> AlignmentCollections { get; }

public override IEnumerable<AlignmentRow> GetRows()
public virtual int Count(bool includeEmpty = true, IEnumerable<string> textIds = null)
{
return includeEmpty ? GetRows(textIds).Count() : GetRows(textIds).Count(r => !r.IsEmpty);
}

int ICorpus<AlignmentRow>.Count(bool includeEmpty)
{
return Count(includeEmpty, null);
}

public IEnumerable<AlignmentRow> GetRows()
{
return GetRows(null);
}

public abstract IEnumerable<AlignmentRow> GetRows(IEnumerable<string> alignmentCollectionIds);
public abstract IEnumerable<AlignmentRow> GetRows(IEnumerable<string> textIds);

public IEnumerator<AlignmentRow> GetEnumerator()
{
return GetRows().GetEnumerator();
}

IEnumerator IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
}
}
Loading

0 comments on commit 622ffe5

Please sign in to comment.