Skip to content

Commit

Permalink
Ignore ellipses (#469)
Browse files Browse the repository at this point in the history
* Filter out ellipses segments

* Test pretranslation content
  • Loading branch information
Enkidu93 authored Aug 29, 2024
1 parent 8857819 commit a111c67
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,8 @@ ITextCorpus trgCorpus
IEnumerable<string>? textIds = corpus.TrainOnChapters is not null
? corpus.TrainOnChapters.Keys
: corpus.TrainOnTextIds;
srcCorpora = srcCorpora.Select(sc => sc.FilterTexts(textIds)).ToArray();
trgCorpus = trgCorpus.FilterTexts(textIds);
srcCorpora = srcCorpora.Select(sc => sc.FilterTexts(textIds).Transform(CleanSegment)).ToArray();
trgCorpus = trgCorpus.FilterTexts(textIds).Transform(CleanSegment);

if (trgCorpus.IsScripture())
{
Expand Down Expand Up @@ -389,8 +389,8 @@ private static IEnumerable<Row> AlignPretranslateCorpus(Corpus corpus, ITextCorp
IEnumerable<string>? textIds = corpus.PretranslateChapters is not null
? corpus.PretranslateChapters.Keys
: corpus.PretranslateTextIds;
srcCorpus = srcCorpus.FilterTexts(textIds);
trgCorpus = trgCorpus.FilterTexts(textIds);
srcCorpus = srcCorpus.FilterTexts(textIds).Transform(CleanSegment);
trgCorpus = trgCorpus.FilterTexts(textIds).Transform(CleanSegment);
int rowCount = 0;
StringBuilder srcSegBuffer = new();
StringBuilder trgSegBuffer = new();
Expand Down Expand Up @@ -446,4 +446,11 @@ protected virtual bool ResolveLanguageCodeForBaseModel(string languageCode, out
resolvedCode = languageCode;
return true;
}

/// <summary>
/// Empties the segment of a row whose text is exactly "..." so that a
/// free-standing ellipsis is not carried forward as real content.
/// </summary>
/// <param name="row">The row to inspect; it is modified in place when it matches.</param>
/// <returns>The same <paramref name="row"/> instance that was passed in.</returns>
private static TextRow CleanSegment(TextRow row)
{
    bool isFreestandingEllipsis = row.Text == "...";
    if (isFreestandingEllipsis)
    {
        row.Segment = [];
    }

    return row;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -188,12 +188,12 @@ public async Task RunAsync_MixedSource_Paratext()
(int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync();
Assert.Multiple(() =>
{
Assert.That(src1Count, Is.EqualTo(4));
Assert.That(src1Count, Is.EqualTo(5));
Assert.That(src2Count, Is.EqualTo(12));
Assert.That(trgCount, Is.EqualTo(1));
Assert.That(termCount, Is.EqualTo(0));
});
Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(12));
Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(13));
}

[Test]
Expand Down Expand Up @@ -242,6 +242,46 @@ public async Task RunAsync_UnknownLanguageTagsNoDataSmtTransfer()
await env.RunBuildJobAsync(corpus1, engineId: "engine2", engineType: TranslationEngineType.SmtTransfer);
}

[Test]
public async Task RunAsync_RemoveFreestandingEllipses()
{
    // Expected contents of the training extracts and of the single pretranslation entry.
    string expectedSourceExtract = "Source one, chapter two, verse one.\nSource one, chapter two, verse two.\n\n";
    string expectedTargetExtract = "Target one, chapter two, verse one.\n\nTarget one, chapter two, verse three.\n";
    string expectedTranslation = "Source one, chapter two, verse two.";

    // Builds a fresh selection of MAT chapter 2 on every call, so the training and
    // pretranslation filters do not share a dictionary instance.
    static Dictionary<string, HashSet<int>> MatthewChapterTwo()
    {
        return new Dictionary<string, HashSet<int>> { ["MAT"] = new HashSet<int>() { 2 } };
    }

    using TestEnvironment env = new();
    Corpus corpus1 = env.DefaultParatextCorpus with
    {
        TrainOnChapters = MatthewChapterTwo(),
        PretranslateChapters = MatthewChapterTwo()
    };

    await env.RunBuildJobAsync(corpus1, useKeyTerms: false);

    // The actual extract is also passed as the failure message so it shows up in full on a mismatch.
    string sourceExtract = await env.GetSourceExtractAsync();
    Assert.That(sourceExtract, Is.EqualTo(expectedSourceExtract), sourceExtract);

    string targetExtract = await env.GetTargetExtractAsync();
    Assert.That(targetExtract, Is.EqualTo(expectedTargetExtract), targetExtract);

    // Exactly one pretranslation entry is expected, carrying the verse-two source text.
    JsonArray? pretranslations = await env.GetPretranslationAsync();
    Assert.That(pretranslations, Is.Not.Null);
    Assert.That(pretranslations.Count, Is.EqualTo(1));
    Assert.That(pretranslations[0]!["translation"]!.ToString(), Is.EqualTo(expectedTranslation));
}

[Test]
public void RunAsync_OnlyParseSelectedBooks_NoBadBooks()
{
Expand Down Expand Up @@ -581,6 +621,18 @@ public Task RunBuildJobAsync(
.RunAsync(engineId, "build1", [corpus], useKeyTerms ? null : "{\"use_key_terms\":false}", default);
}

/// <summary>
/// Reads the whole of "builds/build1/train.src.txt" from the shared file service.
/// </summary>
/// <returns>The complete text content of the source training extract.</returns>
public async Task<string> GetSourceExtractAsync()
{
    var sourceExtract = await SharedFileService.OpenReadAsync("builds/build1/train.src.txt");
    using StreamReader extractReader = new(sourceExtract);
    string contents = await extractReader.ReadToEndAsync();
    return contents;
}

/// <summary>
/// Reads the whole of "builds/build1/train.trg.txt" from the shared file service.
/// </summary>
/// <returns>The complete text content of the target training extract.</returns>
public async Task<string> GetTargetExtractAsync()
{
    var targetExtract = await SharedFileService.OpenReadAsync("builds/build1/train.trg.txt");
    using StreamReader extractReader = new(targetExtract);
    string contents = await extractReader.ReadToEndAsync();
    return contents;
}

public async Task<(int Source1Count, int Source2Count, int TargetCount, int TermCount)> GetTrainCountAsync()
{
using StreamReader srcReader = new(await SharedFileService.OpenReadAsync("builds/build1/train.src.txt"));
Expand Down Expand Up @@ -610,12 +662,16 @@ public Task RunBuildJobAsync(
return (src1Count, src2Count, trgCount, termCount);
}

public async Task<int> GetPretranslateCountAsync()
public async Task<JsonArray?> GetPretranslationAsync()
{
using StreamReader reader =
new(await SharedFileService.OpenReadAsync("builds/build1/pretranslate.src.json"));
JsonArray? pretranslationJsonObject = JsonSerializer.Deserialize<JsonArray>(await reader.ReadToEndAsync());
return pretranslationJsonObject?.Count ?? 0;
return JsonSerializer.Deserialize<JsonArray>(await reader.ReadToEndAsync());
}

/// <summary>
/// Counts the entries in the array returned by <c>GetPretranslationAsync</c>.
/// </summary>
/// <returns>The number of entries, or 0 when no array was returned.</returns>
public async Task<int> GetPretranslateCountAsync()
{
    JsonArray? pretranslations = await GetPretranslationAsync();
    if (pretranslations is null)
    {
        return 0;
    }

    return pretranslations.Count;
}

private void ZipParatextProject(string name)
Expand Down Expand Up @@ -659,7 +715,7 @@ private class DummyCorpus(IEnumerable<string> books, IEnumerable<string> failsOn
new List<TextRow>() { new TextRow(b, new ScriptureRef(new VerseRef("MAT", "1", "1", ScrVers.English))) }
));

public bool IsTokenized => throw new NotImplementedException();
public bool IsTokenized => false;

public ScrVers Versification => ScrVers.English;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@
\c 2
\p
\v 1 Source one, chapter two, verse one.
\v 2 Source one, chapter two, verse two.
\v 3 ...
\v 4 ...
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@
\c 2
\p
\v 1 Target one, chapter two, verse one.
\v 2 Target one, chapter two, verse two.
\v 2 ...
\v 3 Target one, chapter two, verse three.

0 comments on commit a111c67

Please sign in to comment.